{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9223390518354547, "eval_steps": 5000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 646.5, "completions/mean_terminated_length": 646.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.00018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 9556.0, "reward": 1.3977272510528564, "reward_std": 0.7336369156837463, "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.42362356185913086, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.00036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.0, "learning_rate": 1.8433179723502305e-08, "loss": -0.0, "num_tokens": 17493.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.0005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.00016184583455469692, "learning_rate": 3.686635944700461e-08, "loss": 0.0, "num_tokens": 26416.0, "reward": 1.462499976158142, "reward_std": 0.329712450504303, "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.318696528673172, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.0007378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0001465609470869822, "learning_rate": 5.529953917050692e-08, "loss": 0.0, "num_tokens": 35517.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 396.25, "completions/mean_terminated_length": 396.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.0009223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.00013932313049735967, "learning_rate": 7.373271889400922e-08, "loss": 0.0, "num_tokens": 42935.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 602.75, "completions/mean_terminated_length": 602.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.0011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.00022564510436495766, "learning_rate": 9.216589861751152e-08, "loss": 0.0, "num_tokens": 54309.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 499.125, "completions/mean_terminated_length": 499.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0012912746725696365, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0002626226232678164, "learning_rate": 1.1059907834101384e-07, "loss": 0.0, "num_tokens": 63726.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0014757424829367274, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.00026422405971970875, "learning_rate": 1.2903225806451614e-07, "loss": 0.0, "num_tokens": 69214.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.0016602102933038186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.00015682746834499994, "learning_rate": 1.4746543778801844e-07, "loss": 0.0, "num_tokens": 79675.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 434.625, "completions/mean_terminated_length": 434.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.0018446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00019636107299447758, "learning_rate": 1.6589861751152077e-07, "loss": 0.0, "num_tokens": 87800.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0020291459140380002, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.00018017001002590405, "learning_rate": 1.8433179723502305e-07, "loss": 0.0, "num_tokens": 91860.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 539.25, "completions/mean_terminated_length": 539.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.002213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.00024090245096886065, "learning_rate": 2.0276497695852537e-07, "loss": 0.0, "num_tokens": 105910.0, "reward": 1.0625, "reward_std": 0.6813851594924927, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.32732683420181274, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.002398081534772182, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.00023027175666356925, "learning_rate": 2.2119815668202768e-07, "loss": 0.0, "num_tokens": 112539.0, "reward": 0.776442289352417, "reward_std": 0.4996762275695801, "rewards/fixed_code_pass_all_test_reward/mean": 0.401442289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.2804637849330902, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.002582549345139273, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0002332902204216225, "learning_rate": 2.3963133640553e-07, "loss": 0.0, "num_tokens": 120241.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 399.25, "completions/mean_terminated_length": 399.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.002767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.00019597944447014015, "learning_rate": 2.580645161290323e-07, "loss": 0.0, "num_tokens": 129275.0, "reward": 1.5431817770004272, "reward_std": 0.4559911787509918, "rewards/fixed_code_pass_all_test_reward/mean": 0.668181836605072, "rewards/fixed_code_pass_all_test_reward/std": 0.4215841293334961, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.002951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.00018851919548978913, "learning_rate": 2.764976958525346e-07, "loss": 0.0, "num_tokens": 134441.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 604.375, "completions/mean_terminated_length": 604.375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.003135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.00017959067736228462, "learning_rate": 2.949308755760369e-07, "loss": 0.0, "num_tokens": 144380.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 376.125, "completions/mean_terminated_length": 376.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.003320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0002442944605718367, "learning_rate": 3.133640552995392e-07, "loss": 0.0, "num_tokens": 151581.0, "reward": 0.7074999809265137, "reward_std": 0.6952028870582581, "rewards/fixed_code_pass_all_test_reward/mean": 0.20749999582767487, "rewards/fixed_code_pass_all_test_reward/std": 0.3301839530467987, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.003504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.00024068942002486438, "learning_rate": 3.3179723502304154e-07, "loss": 0.0, "num_tokens": 156697.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 677.375, "completions/mean_terminated_length": 677.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.003689356207341819, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.00015377472118416335, "learning_rate": 3.5023041474654376e-07, "loss": 0.0, "num_tokens": 172716.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.00387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.00029984075081301853, "learning_rate": 3.686635944700461e-07, "loss": 0.0, "num_tokens": 177988.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 619.0, "completions/mean_terminated_length": 619.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0040582918280760005, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.00015745200380479218, "learning_rate": 3.870967741935484e-07, "loss": 0.0, "num_tokens": 192316.0, "reward": 0.8888888359069824, "reward_std": 0.3596327006816864, "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, "rewards/fixed_code_pass_all_test_reward/std": 0.019168488681316376, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 539.375, "completions/mean_terminated_length": 539.375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.004242759638443091, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.0001711196609903709, "learning_rate": 4.0552995391705075e-07, "loss": 0.0, "num_tokens": 201975.0, "reward": 0.5089285969734192, "reward_std": 0.5364790558815002, "rewards/fixed_code_pass_all_test_reward/mean": 0.008928571827709675, "rewards/fixed_code_pass_all_test_reward/std": 0.007393559440970421, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 565.625, "completions/mean_terminated_length": 565.625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.004427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.0002451626151014352, "learning_rate": 4.23963133640553e-07, "loss": 0.0, "num_tokens": 217588.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 597.875, "completions/mean_terminated_length": 597.875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.004611695259177273, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.00022689109391649254, "learning_rate": 4.4239631336405535e-07, "loss": 0.0, "num_tokens": 232307.0, "reward": 1.3095238208770752, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.4345238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.1912188082933426, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.004796163069544364, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.00022167046336107887, "learning_rate": 4.608294930875577e-07, "loss": 0.0, "num_tokens": 240109.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.004980630879911455, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0002217927240053541, "learning_rate": 4.7926267281106e-07, "loss": 0.0, "num_tokens": 244777.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 557.0, "completions/mean_terminated_length": 557.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.005165098690278546, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.0001640177094941464, "learning_rate": 4.976958525345623e-07, "loss": 0.0, "num_tokens": 255409.0, "reward": 1.5416667461395264, "reward_std": 0.8275063037872314, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.374898761510849, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.005349566500645637, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.00023256600616150536, "learning_rate": 5.161290322580646e-07, "loss": 0.0, "num_tokens": 266307.0, "reward": 1.262195110321045, "reward_std": 0.5335278511047363, "rewards/fixed_code_pass_all_test_reward/mean": 0.3871951103210449, "rewards/fixed_code_pass_all_test_reward/std": 0.3236880600452423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 458.5, "completions/mean_terminated_length": 458.5, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.005534034311012728, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.00022078821348259225, "learning_rate": 5.345622119815668e-07, "loss": 0.0, "num_tokens": 274935.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1012.5, "completions/mean_terminated_length": 667.3333740234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.005718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.00015209946104732808, "learning_rate": 5.529953917050692e-07, "loss": 0.0, "num_tokens": 289515.0, "reward": 1.02734375, "reward_std": 0.6355004906654358, "rewards/fixed_code_pass_all_test_reward/mean": 0.40234375, "rewards/fixed_code_pass_all_test_reward/std": 0.3611418306827545, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 415.875, "completions/mean_terminated_length": 415.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.00590296993174691, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00021360143909987528, "learning_rate": 5.714285714285715e-07, "loss": 0.0, "num_tokens": 296794.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 450.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.006087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.00021438846397359157, "learning_rate": 5.898617511520738e-07, "loss": 0.0, "num_tokens": 307232.0, "reward": 0.8928571343421936, "reward_std": 0.8390957117080688, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.006271905552481092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034027099609375, "kl": 0.00017366967131238198, "learning_rate": 6.082949308755762e-07, "loss": 0.0, "num_tokens": 313486.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 599.875, "completions/mean_terminated_length": 599.875, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.006456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.00015820199132576818, "learning_rate": 6.267281105990784e-07, "loss": 0.0, "num_tokens": 322501.0, "reward": 1.6428571939468384, "reward_std": 0.38936299085617065, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.10628911107778549, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 427.125, "completions/mean_terminated_length": 427.125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.006640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0001587776869200752, "learning_rate": 6.451612903225807e-07, "loss": 0.0, "num_tokens": 334158.0, "reward": 1.4083333015441895, "reward_std": 0.7391006946563721, "rewards/fixed_code_pass_all_test_reward/mean": 0.6583333611488342, "rewards/fixed_code_pass_all_test_reward/std": 0.3215784728527069, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 553.875, "completions/mean_terminated_length": 553.875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.006825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.00016334438805643003, "learning_rate": 6.635944700460831e-07, "loss": 0.0, "num_tokens": 345949.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 600.75, "completions/mean_terminated_length": 600.75, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.007009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.00028653825029323343, "learning_rate": 6.820276497695854e-07, "loss": 0.0, "num_tokens": 355195.0, "reward": 0.8265306353569031, "reward_std": 0.5420389175415039, "rewards/fixed_code_pass_all_test_reward/mean": 0.3265306353569031, "rewards/fixed_code_pass_all_test_reward/std": 0.20755070447921753, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 793.625, "completions/mean_terminated_length": 793.625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.007194244604316547, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0002119701912306482, "learning_rate": 7.004608294930875e-07, "loss": 0.0, "num_tokens": 366856.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 501.75, "completions/mean_terminated_length": 501.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.007378712414683638, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0001861243126768386, "learning_rate": 7.1889400921659e-07, "loss": 0.0, "num_tokens": 375366.0, "reward": 0.699999988079071, "reward_std": 0.6141195893287659, "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.3195979595184326, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.007563180225050729, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0002582957795311813, "learning_rate": 7.373271889400922e-07, "loss": 0.0, "num_tokens": 383559.0, "reward": 0.83984375, "reward_std": 0.4687314033508301, "rewards/fixed_code_pass_all_test_reward/mean": 0.08984375, "rewards/fixed_code_pass_all_test_reward/std": 0.08749601244926453, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.00774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0002460242722008843, "learning_rate": 7.557603686635945e-07, "loss": 0.0, "num_tokens": 392973.0, "reward": 1.1982758045196533, "reward_std": 0.8355019688606262, "rewards/fixed_code_pass_all_test_reward/mean": 0.4482758641242981, "rewards/fixed_code_pass_all_test_reward/std": 0.4770955741405487, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.00793211584578491, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.00019570136464608368, "learning_rate": 7.741935483870968e-07, "loss": 0.0, "num_tokens": 401985.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.008116583656152001, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.0001460007972582389, "learning_rate": 7.926267281105991e-07, "loss": 0.0, "num_tokens": 407596.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 424.375, "completions/mean_terminated_length": 424.375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.008301051466519093, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.00017994168865698157, "learning_rate": 8.110599078341015e-07, "loss": 0.0, "num_tokens": 415879.0, "reward": 0.8928571343421936, "reward_std": 0.6468132138252258, "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.008485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0001759659353410825, "learning_rate": 8.294930875576038e-07, "loss": 0.0, "num_tokens": 422768.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 696.375, "completions/mean_terminated_length": 696.375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.008669987087253275, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.00017174023378174752, "learning_rate": 8.47926267281106e-07, "loss": 0.0, "num_tokens": 434227.0, "reward": 1.4757652282714844, "reward_std": 0.46155646443367004, "rewards/fixed_code_pass_all_test_reward/mean": 0.7257653474807739, "rewards/fixed_code_pass_all_test_reward/std": 0.27170121669769287, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.008854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.00018270496093464317, "learning_rate": 8.663594470046084e-07, "loss": 0.0, "num_tokens": 445524.0, "reward": 0.9421296715736389, "reward_std": 0.3907661437988281, "rewards/fixed_code_pass_all_test_reward/mean": 0.06712962687015533, "rewards/fixed_code_pass_all_test_reward/std": 0.09229449927806854, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.009038922707987456, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "kl": 0.00014784874838369433, "learning_rate": 8.847926267281107e-07, "loss": 0.0, "num_tokens": 450820.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.009223390518354546, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.0001441684862584225, "learning_rate": 9.032258064516129e-07, "loss": 0.0, "num_tokens": 461076.0, "reward": 1.0, "reward_std": 0.6172134280204773, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 471.875, "completions/mean_terminated_length": 471.875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.009407858328721638, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.00013033490677116788, "learning_rate": 9.216589861751154e-07, "loss": 0.0, "num_tokens": 470219.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.009592326139088728, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.00011888523840752896, "learning_rate": 9.400921658986175e-07, "loss": 0.0, "num_tokens": 481007.0, "reward": 1.0029070377349854, "reward_std": 0.53458571434021, "rewards/fixed_code_pass_all_test_reward/mean": 0.12790697813034058, "rewards/fixed_code_pass_all_test_reward/std": 0.3524727523326874, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.00977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.0001700549510132987, "learning_rate": 9.5852534562212e-07, "loss": 0.0, "num_tokens": 486060.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 519.625, "completions/mean_terminated_length": 519.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.00996126175982291, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.00022266689484240487, "learning_rate": 9.769585253456222e-07, "loss": 0.0, "num_tokens": 496705.0, "reward": 0.9015151262283325, "reward_std": 0.5400769114494324, "rewards/fixed_code_pass_all_test_reward/mean": 0.1515151560306549, "rewards/fixed_code_pass_all_test_reward/std": 0.24511492252349854, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 676.0, "completions/mean_terminated_length": 676.0, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.010145729570190002, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.00014531454917232622, "learning_rate": 9.953917050691246e-07, "loss": 0.0, "num_tokens": 511441.0, "reward": 0.9711538553237915, "reward_std": 0.5999965071678162, "rewards/fixed_code_pass_all_test_reward/mean": 0.2211538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.1390555202960968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 654.5, "completions/mean_terminated_length": 455.4285888671875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.010330197380557092, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.00017269806494368822, "learning_rate": 1.013824884792627e-06, "loss": 0.0, "num_tokens": 523501.0, "reward": 1.139925479888916, "reward_std": 0.6464568972587585, "rewards/fixed_code_pass_all_test_reward/mean": 0.5149253606796265, "rewards/fixed_code_pass_all_test_reward/std": 0.31781885027885437, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 442.0, "completions/mean_terminated_length": 442.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.010514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.00022005929895385634, "learning_rate": 1.0322580645161291e-06, "loss": 0.0, "num_tokens": 534421.0, "reward": 1.4196429252624512, "reward_std": 0.5780074000358582, "rewards/fixed_code_pass_all_test_reward/mean": 0.6696428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3534245193004608, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.010699133001291274, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.00013272725936985807, "learning_rate": 1.0506912442396313e-06, "loss": 0.0, "num_tokens": 538620.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.010883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.00011469083437987138, "learning_rate": 1.0691244239631337e-06, "loss": 0.0, "num_tokens": 543435.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 698.75, "completions/mean_terminated_length": 249.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.011068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.00024947148813225795, "learning_rate": 1.087557603686636e-06, "loss": 0.0, "num_tokens": 552041.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 552.875, "completions/mean_terminated_length": 552.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.011252536432392548, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0002478203441569349, "learning_rate": 1.1059907834101384e-06, "loss": 0.0, "num_tokens": 561000.0, "reward": 0.8541666865348816, "reward_std": 0.6135863065719604, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4833538830280304, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.011437004242759638, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.00011564837814148632, "learning_rate": 1.1244239631336406e-06, "loss": 0.0, "num_tokens": 565241.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.01162147205312673, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.00023581041023135185, "learning_rate": 1.142857142857143e-06, "loss": 0.0, "num_tokens": 571197.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.01180593986349382, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.000263547352005844, "learning_rate": 1.1612903225806454e-06, "loss": 0.0, "num_tokens": 581405.0, "reward": 0.8928571343421936, "reward_std": 0.3642157018184662, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.011990407673860911, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.0001780624866114522, "learning_rate": 1.1797235023041475e-06, "loss": 0.0, "num_tokens": 588253.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.012174875484228001, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.00018533837464929093, "learning_rate": 1.19815668202765e-06, "loss": 0.0, "num_tokens": 592787.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 692.375, "completions/mean_terminated_length": 692.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.012359343294595093, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.00023742977191432146, "learning_rate": 1.2165898617511523e-06, "loss": 0.0, "num_tokens": 607598.0, "reward": 0.800000011920929, "reward_std": 0.7406560778617859, "rewards/fixed_code_pass_all_test_reward/mean": 0.17499999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 633.375, "completions/mean_terminated_length": 633.375, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.012543811104962183, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.00013511399265553337, "learning_rate": 1.2350230414746545e-06, "loss": 0.0, "num_tokens": 617577.0, "reward": 0.9166667461395264, "reward_std": 0.8309490084648132, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4154745042324066, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.012728278915329275, "frac_reward_zero_std": 1.0, "grad_norm": 0.004486083984375, "kl": 0.00023202788543130737, "learning_rate": 1.2534562211981569e-06, "loss": 0.0, "num_tokens": 629769.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 616.5, "completions/mean_terminated_length": 616.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.012912746725696367, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.00015789896042406326, "learning_rate": 1.271889400921659e-06, "loss": 0.0, "num_tokens": 641781.0, "reward": 0.7708333730697632, "reward_std": 0.46876654028892517, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 191.57144165039062, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.013097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.00016825743887238787, "learning_rate": 1.2903225806451614e-06, "loss": 0.0, "num_tokens": 648842.0, "reward": 1.25, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.013281682346430549, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.00022476712092611706, "learning_rate": 1.3087557603686638e-06, "loss": 0.0, "num_tokens": 655355.0, "reward": 0.7000000476837158, "reward_std": 0.4820590913295746, "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.29113897681236267, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 652.0, "completions/mean_terminated_length": 452.5714416503906, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.013466150156797639, "frac_reward_zero_std": 0.0, "grad_norm": 4.75, "kl": 0.0002261804784211563, "learning_rate": 1.3271889400921662e-06, "loss": 0.0, "num_tokens": 666403.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.01365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.00017740407565725036, "learning_rate": 1.3456221198156683e-06, "loss": 0.0, "num_tokens": 670388.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.01383508577753182, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.00030659233379992656, "learning_rate": 1.3640552995391707e-06, "loss": 0.0, "num_tokens": 675493.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 555.0, "completions/mean_terminated_length": 555.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.014019553587898912, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.00023349323964794166, "learning_rate": 1.382488479262673e-06, "loss": 0.0, "num_tokens": 688861.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1309.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 938.125, "completions/mean_terminated_length": 938.125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.014204021398266002, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0003092967308475636, "learning_rate": 1.400921658986175e-06, "loss": 0.0, "num_tokens": 703574.0, "reward": 0.3822115361690521, "reward_std": 0.5278024077415466, "rewards/fixed_code_pass_all_test_reward/mean": 0.0072115384973585606, "rewards/fixed_code_pass_all_test_reward/std": 0.020397311076521873, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.014388489208633094, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.00021497006855497602, "learning_rate": 1.4193548387096776e-06, "loss": 0.0, "num_tokens": 715083.0, "reward": 1.25, "reward_std": 0.6306403279304504, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.40689605474472046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 612.375, "completions/mean_terminated_length": 407.2857360839844, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.014572957019000184, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0001775709733919939, "learning_rate": 1.43778801843318e-06, "loss": 0.0, "num_tokens": 726710.0, "reward": 0.9615384340286255, "reward_std": 0.649468183517456, "rewards/fixed_code_pass_all_test_reward/mean": 0.21153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.3295787572860718, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 399.125, "completions/mean_terminated_length": 399.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.014757424829367276, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.00019801928829110693, "learning_rate": 1.456221198156682e-06, "loss": 0.0, "num_tokens": 734071.0, "reward": 0.38749998807907104, "reward_std": 0.5185625553131104, "rewards/fixed_code_pass_all_test_reward/mean": 0.01249999925494194, "rewards/fixed_code_pass_all_test_reward/std": 0.018322506919503212, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 540.625, "completions/mean_terminated_length": 540.625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.014941892639734366, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.000200824973944691, "learning_rate": 1.4746543778801844e-06, "loss": 0.0, "num_tokens": 743548.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 781.0, "completions/mean_terminated_length": 781.0, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.015126360450101458, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.00017313397802354302, "learning_rate": 1.4930875576036868e-06, "loss": 0.0, "num_tokens": 755860.0, "reward": 0.8035714626312256, "reward_std": 0.5286955237388611, "rewards/fixed_code_pass_all_test_reward/mean": 0.1785714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 837.875, "completions/mean_terminated_length": 665.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.015310828260468548, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0002544065773690818, "learning_rate": 1.511520737327189e-06, "loss": 0.0, "num_tokens": 770435.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.01549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.000220081898078206, "learning_rate": 1.5299539170506913e-06, "loss": 0.0, "num_tokens": 780955.0, "reward": 1.170454502105713, "reward_std": 0.5152629017829895, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.2368127703666687, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 445.625, "completions/mean_terminated_length": 445.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.01567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.00019480635637592059, "learning_rate": 1.5483870967741937e-06, "loss": 0.0, "num_tokens": 793088.0, "reward": 1.1689815521240234, "reward_std": 0.17608249187469482, "rewards/fixed_code_pass_all_test_reward/mean": 0.16898147761821747, "rewards/fixed_code_pass_all_test_reward/std": 0.17608249187469482, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.01586423169156982, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.00016720175563023076, "learning_rate": 1.5668202764976959e-06, "loss": 0.0, "num_tokens": 798829.0, "reward": 1.40625, "reward_std": 0.9057110548019409, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.016048699501936912, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.0005037817518314114, "learning_rate": 1.5852534562211982e-06, "loss": 0.0, "num_tokens": 802827.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.016233167312304002, "frac_reward_zero_std": 1.0, "grad_norm": 0.005035400390625, "kl": 0.00017259119431400904, "learning_rate": 1.6036866359447006e-06, "loss": 0.0, "num_tokens": 809255.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 489.625, "completions/mean_terminated_length": 489.625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.016417635122671095, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.00023493254229833838, "learning_rate": 1.622119815668203e-06, "loss": 0.0, "num_tokens": 818868.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 640.5, "completions/mean_terminated_length": 640.5, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.016602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0002694915783649776, "learning_rate": 1.6405529953917052e-06, "loss": 0.0, "num_tokens": 832872.0, "reward": 0.8684210181236267, "reward_std": 0.5631824731826782, "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.18766893446445465, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.016786570743405275, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.00021502541858353652, "learning_rate": 1.6589861751152075e-06, "loss": 0.0, "num_tokens": 843376.0, "reward": 1.5104167461395264, "reward_std": 0.4552112817764282, "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4317220449447632, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 618.75, "completions/mean_terminated_length": 618.75, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.016971038553772366, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.00017798052704165457, "learning_rate": 1.67741935483871e-06, "loss": 0.0, "num_tokens": 857310.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.01715550636413946, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.00022852111760585103, "learning_rate": 1.695852534562212e-06, "loss": 0.0, "num_tokens": 862846.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 493.0, "completions/mean_terminated_length": 493.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.01733997417450655, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0002933645519078709, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "num_tokens": 871038.0, "reward": 0.7307692170143127, "reward_std": 0.6435846090316772, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076924681663513, "rewards/fixed_code_pass_all_test_reward/std": 0.358450710773468, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 456.625, "completions/mean_terminated_length": 456.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.01752444198487364, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0002652740322446334, "learning_rate": 1.7327188940092169e-06, "loss": 0.0, "num_tokens": 879763.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.01770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.00019635578155430267, "learning_rate": 1.751152073732719e-06, "loss": 0.0, "num_tokens": 889212.0, "reward": 1.0999999046325684, "reward_std": 0.45669618248939514, "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.13887302577495575, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 749.25, "completions/mean_terminated_length": 563.7142944335938, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.017893377605607823, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.00024798521008051466, "learning_rate": 1.7695852534562214e-06, "loss": 0.0, "num_tokens": 903390.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.018077845415974913, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.00027365733876649756, "learning_rate": 1.7880184331797238e-06, "loss": 0.0, "num_tokens": 909402.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 477.625, "completions/mean_terminated_length": 477.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.018262313226342003, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0002560757529863622, "learning_rate": 1.8064516129032258e-06, "loss": 0.0, "num_tokens": 918223.0, "reward": 1.1370967626571655, "reward_std": 0.7183694839477539, "rewards/fixed_code_pass_all_test_reward/mean": 0.5120967626571655, "rewards/fixed_code_pass_all_test_reward/std": 0.35017356276512146, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.018446781036709093, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0003022496966877952, "learning_rate": 1.8248847926267283e-06, "loss": 0.0, "num_tokens": 923452.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.018631248847076187, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.000247218271397287, "learning_rate": 1.8433179723502307e-06, "loss": 0.0, "num_tokens": 931719.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.018815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0002791819078993285, "learning_rate": 1.8617511520737327e-06, "loss": 0.0, "num_tokens": 941634.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 460.25, "completions/mean_terminated_length": 460.25, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.019000184467810367, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.00025024179558386095, "learning_rate": 1.880184331797235e-06, "loss": 0.0, "num_tokens": 949820.0, "reward": 0.90625, "reward_std": 0.6179162263870239, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.27973026037216187, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 300.75, "completions/mean_terminated_length": 300.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.019184652278177457, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.00030522290489898296, "learning_rate": 1.8986175115207374e-06, "loss": 0.0, "num_tokens": 955074.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.01936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.0002908794067479903, "learning_rate": 1.91705069124424e-06, "loss": 0.0, "num_tokens": 960924.0, "reward": 1.3125, "reward_std": 0.6868232488632202, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.47087812423706055, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 577.625, "completions/mean_terminated_length": 577.625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.01955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.00024830274196574464, "learning_rate": 1.935483870967742e-06, "loss": 0.0, "num_tokens": 968777.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 659.875, "completions/mean_terminated_length": 659.875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.01973805570927873, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.0002435520655126311, "learning_rate": 1.9539170506912444e-06, "loss": 0.0, "num_tokens": 981424.0, "reward": 1.3068182468414307, "reward_std": 0.4262283742427826, "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.37776920199394226, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 424.75, "completions/mean_terminated_length": 424.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.01992252351964582, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.00024267719345516525, "learning_rate": 1.9723502304147468e-06, "loss": 0.0, "num_tokens": 991270.0, "reward": 1.2890625, "reward_std": 0.7052794694900513, "rewards/fixed_code_pass_all_test_reward/mean": 0.5390625, "rewards/fixed_code_pass_all_test_reward/std": 0.286776065826416, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.020106991330012914, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.00026080430689034984, "learning_rate": 1.990783410138249e-06, "loss": 0.0, "num_tokens": 1002474.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.020291459140380004, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0002325564810234937, "learning_rate": 2.0092165898617515e-06, "loss": 0.0, "num_tokens": 1009785.0, "reward": 1.4285714626312256, "reward_std": 0.7284314036369324, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.38132426142692566, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 449.625, "completions/mean_terminated_length": 449.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.020475926950747094, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.00016853617398737697, "learning_rate": 2.027649769585254e-06, "loss": 0.0, "num_tokens": 1019646.0, "reward": 1.2374999523162842, "reward_std": 0.61105877161026, "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.38055410981178284, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/max_terminated_length": 1135.0, "completions/mean_length": 464.875, "completions/mean_terminated_length": 464.875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.020660394761114184, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.000360059981176164, "learning_rate": 2.046082949308756e-06, "loss": 0.0, "num_tokens": 1029933.0, "reward": 0.836538553237915, "reward_std": 0.6231482028961182, "rewards/fixed_code_pass_all_test_reward/mean": 0.21153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.16446846723556519, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 616.625, "completions/mean_terminated_length": 616.625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.020844862571481278, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.00024101105464069406, "learning_rate": 2.0645161290322582e-06, "loss": 0.0, "num_tokens": 1045258.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 512.5, "completions/mean_terminated_length": 512.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.021029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0003414558359509101, "learning_rate": 2.0829493087557606e-06, "loss": 0.0, "num_tokens": 1053870.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 575.75, "completions/mean_terminated_length": 575.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.021213798192215458, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.00022292660651146434, "learning_rate": 2.1013824884792626e-06, "loss": 0.0, "num_tokens": 1065236.0, "reward": 0.9375, "reward_std": 0.6232117414474487, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 486.375, "completions/mean_terminated_length": 486.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.021398266002582548, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.00018211517999588978, "learning_rate": 2.119815668202765e-06, "loss": 0.0, "num_tokens": 1076815.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.02158273381294964, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.0004432215409906348, "learning_rate": 2.1382488479262673e-06, "loss": 0.0, "num_tokens": 1082892.0, "reward": 1.7083333730697632, "reward_std": 0.7000566720962524, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.35634833574295044, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.02176720162331673, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.000386988684113021, "learning_rate": 2.1566820276497697e-06, "loss": 0.0, "num_tokens": 1091219.0, "reward": 0.512499988079071, "reward_std": 0.49407199025154114, "rewards/fixed_code_pass_all_test_reward/mean": 0.26249998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.1922609806060791, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.02195166943368382, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.00035162228959961794, "learning_rate": 2.175115207373272e-06, "loss": 0.0, "num_tokens": 1096573.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 591.875, "completions/mean_terminated_length": 591.875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.02213613724405091, "frac_reward_zero_std": 0.0, "grad_norm": 3.109375, "kl": 0.00040626297595736105, "learning_rate": 2.1935483870967745e-06, "loss": 0.0, "num_tokens": 1107100.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.022320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.00030954191788623575, "learning_rate": 2.211981566820277e-06, "loss": 0.0, "num_tokens": 1114083.0, "reward": 0.980555534362793, "reward_std": 0.4398873448371887, "rewards/fixed_code_pass_all_test_reward/mean": 0.10555555671453476, "rewards/fixed_code_pass_all_test_reward/std": 0.19581152498722076, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 519.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.022505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.00022163750509207603, "learning_rate": 2.230414746543779e-06, "loss": 0.0, "num_tokens": 1124988.0, "reward": 0.9471153616905212, "reward_std": 0.38622599840164185, "rewards/fixed_code_pass_all_test_reward/mean": 0.07211538404226303, "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 523.125, "completions/mean_terminated_length": 523.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.022689540675152185, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.00034853070246754214, "learning_rate": 2.248847926267281e-06, "loss": 0.0, "num_tokens": 1137429.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 460.0, "completions/mean_terminated_length": 460.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.022874008485519275, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0002755507011897862, "learning_rate": 2.2672811059907836e-06, "loss": 0.0, "num_tokens": 1149077.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.02305847629588637, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0004555258601612877, "learning_rate": 2.285714285714286e-06, "loss": 0.0, "num_tokens": 1153878.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.02324294410625346, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.0002706130308069987, "learning_rate": 2.3041474654377884e-06, "loss": 0.0, "num_tokens": 1165792.0, "reward": 1.860576868057251, "reward_std": 0.35006189346313477, "rewards/fixed_code_pass_all_test_reward/mean": 0.9855769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.04079463332891464, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 728.5, "completions/mean_terminated_length": 540.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.02342741191662055, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.00026130559672310483, "learning_rate": 2.3225806451612907e-06, "loss": 0.0, "num_tokens": 1179604.0, "reward": 0.875, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 435.5, "completions/mean_terminated_length": 435.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.02361187972698764, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.000289637715468416, "learning_rate": 2.3410138248847927e-06, "loss": 0.0, "num_tokens": 1187752.0, "reward": 0.942307710647583, "reward_std": 0.4461728036403656, "rewards/fixed_code_pass_all_test_reward/mean": 0.19230769574642181, "rewards/fixed_code_pass_all_test_reward/std": 0.08223423361778259, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 515.5, "completions/mean_terminated_length": 515.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.023796347537354733, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.00040351709321839735, "learning_rate": 2.359447004608295e-06, "loss": 0.0, "num_tokens": 1197380.0, "reward": 0.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 523.5, "completions/mean_terminated_length": 523.5, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.023980815347721823, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0003619024882937083, "learning_rate": 2.3778801843317975e-06, "loss": 0.0, "num_tokens": 1208256.0, "reward": 1.0, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.024165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0002477585449014441, "learning_rate": 2.3963133640553e-06, "loss": 0.0, "num_tokens": 1216512.0, "reward": 1.7357953786849976, "reward_std": 0.2032562494277954, "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.20325623452663422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.024349750968456003, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.0007874341190472478, "learning_rate": 2.4147465437788022e-06, "loss": 0.0, "num_tokens": 1222186.0, "reward": 0.875, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 663.125, "completions/mean_terminated_length": 663.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.024534218778823096, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.00027552976871447754, "learning_rate": 2.4331797235023046e-06, "loss": 0.0, "num_tokens": 1233787.0, "reward": 1.242347002029419, "reward_std": 0.8792401552200317, "rewards/fixed_code_pass_all_test_reward/mean": 0.49234694242477417, "rewards/fixed_code_pass_all_test_reward/std": 0.5267224311828613, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 700.0, "completions/mean_terminated_length": 700.0, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.024718686589190186, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0003037431961274706, "learning_rate": 2.4516129032258066e-06, "loss": 0.0, "num_tokens": 1248771.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 756.75, "completions/mean_terminated_length": 756.75, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.024903154399557276, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.0002866925542548415, "learning_rate": 2.470046082949309e-06, "loss": 0.0, "num_tokens": 1265217.0, "reward": 0.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 585.25, "completions/mean_terminated_length": 585.25, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.025087622209924366, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.00031472905902774073, "learning_rate": 2.4884792626728113e-06, "loss": 0.0, "num_tokens": 1276923.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 489.75, "completions/mean_terminated_length": 489.75, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.02527209002029146, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.00031833308094064705, "learning_rate": 2.5069124423963137e-06, "loss": 0.0, "num_tokens": 1286377.0, "reward": 0.5625, "reward_std": 0.5469068884849548, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 505.25, "completions/mean_terminated_length": 505.25, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.02545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0002574691297922982, "learning_rate": 2.5253456221198157e-06, "loss": 0.0, "num_tokens": 1297707.0, "reward": 1.4267241954803467, "reward_std": 0.5558459162712097, "rewards/fixed_code_pass_all_test_reward/mean": 0.6767241954803467, "rewards/fixed_code_pass_all_test_reward/std": 0.229797825217247, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.02564102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.00028419451518857386, "learning_rate": 2.543778801843318e-06, "loss": 0.0, "num_tokens": 1304571.0, "reward": 1.8888888359069824, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 566.875, "completions/mean_terminated_length": 566.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.025825493451392734, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0004287885094527155, "learning_rate": 2.5622119815668204e-06, "loss": 0.0, "num_tokens": 1315930.0, "reward": 1.4293478727340698, "reward_std": 0.1879972666501999, "rewards/fixed_code_pass_all_test_reward/mean": 0.42934781312942505, "rewards/fixed_code_pass_all_test_reward/std": 0.1879972517490387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.026009961261759824, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.00028534480225062, "learning_rate": 2.580645161290323e-06, "loss": 0.0, "num_tokens": 1320547.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 434.0, "completions/mean_terminated_length": 434.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.026194429072126914, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0003331211682962021, "learning_rate": 2.5990783410138248e-06, "loss": 0.0, "num_tokens": 1328979.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.026378896882494004, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.0006832111193944002, "learning_rate": 2.6175115207373276e-06, "loss": 0.0, "num_tokens": 1337539.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.026563364692861097, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.00036721180731547065, "learning_rate": 2.6359447004608295e-06, "loss": 0.0, "num_tokens": 1342483.0, "reward": 1.1190476417541504, "reward_std": 0.1272672861814499, "rewards/fixed_code_pass_all_test_reward/mean": 0.1190476194024086, "rewards/fixed_code_pass_all_test_reward/std": 0.1272672712802887, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.026747832503228187, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0006242797389859334, "learning_rate": 2.6543778801843323e-06, "loss": 0.0, "num_tokens": 1346807.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 633.625, "completions/mean_terminated_length": 431.5714416503906, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.026932300313595278, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.0005374398151616333, "learning_rate": 2.6728110599078343e-06, "loss": 0.0, "num_tokens": 1360412.0, "reward": 1.1335227489471436, "reward_std": 0.8332343697547913, "rewards/fixed_code_pass_all_test_reward/mean": 0.38352271914482117, "rewards/fixed_code_pass_all_test_reward/std": 0.5107228755950928, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 395.0, "completions/mean_terminated_length": 395.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.027116768123962368, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.0002991663513967069, "learning_rate": 2.6912442396313367e-06, "loss": 0.0, "num_tokens": 1369804.0, "reward": 1.6749999523162842, "reward_std": 0.36936238408088684, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.36936238408088684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.02730123593432946, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0004839637695113197, "learning_rate": 2.709677419354839e-06, "loss": 0.0, "num_tokens": 1374438.0, "reward": 0.75, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 129.375, "completions/mean_terminated_length": 129.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.02748570374469655, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.00085990996012697, "learning_rate": 2.7281105990783414e-06, "loss": 0.0, "num_tokens": 1378289.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 465.25, "completions/mean_terminated_length": 465.25, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.02767017155506364, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0007587236941617448, "learning_rate": 2.7465437788018434e-06, "loss": 0.0, "num_tokens": 1386859.0, "reward": 0.40625, "reward_std": 0.4988826811313629, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.02785463936543073, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0003651740862551378, "learning_rate": 2.764976958525346e-06, "loss": 0.0, "num_tokens": 1397880.0, "reward": 0.9038461446762085, "reward_std": 0.36948251724243164, "rewards/fixed_code_pass_all_test_reward/mean": 0.02884615585207939, "rewards/fixed_code_pass_all_test_reward/std": 0.057232603430747986, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 480.0, "completions/mean_terminated_length": 480.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.028039107175797825, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.00037873095425311476, "learning_rate": 2.783410138248848e-06, "loss": 0.0, "num_tokens": 1410232.0, "reward": 1.2727272510528564, "reward_std": 0.5233621001243591, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, "rewards/fixed_code_pass_all_test_reward/std": 0.43716782331466675, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.028223574986164915, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.0011528825780260377, "learning_rate": 2.80184331797235e-06, "loss": 0.0, "num_tokens": 1414798.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.028408042796532005, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0006661412990069948, "learning_rate": 2.820276497695853e-06, "loss": 0.0, "num_tokens": 1423210.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 768.5, "completions/mean_terminated_length": 768.5, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.028592510606899095, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.00028987093992327573, "learning_rate": 2.8387096774193553e-06, "loss": 0.0, "num_tokens": 1436398.0, "reward": 1.3125, "reward_std": 0.39426735043525696, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3005340099334717, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 865.375, "completions/mean_terminated_length": 696.4285888671875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.02877697841726619, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.00040699875353311654, "learning_rate": 2.8571428571428573e-06, "loss": 0.0, "num_tokens": 1451065.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 554.75, "completions/mean_terminated_length": 554.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.02896144622763328, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.0004439274580363417, "learning_rate": 2.87557603686636e-06, "loss": 0.0, "num_tokens": 1461271.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 483.875, "completions/mean_terminated_length": 483.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.02914591403800037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0003580423963285284, "learning_rate": 2.894009216589862e-06, "loss": 0.0, "num_tokens": 1471838.0, "reward": 1.1781609058380127, "reward_std": 0.3656545579433441, "rewards/fixed_code_pass_all_test_reward/mean": 0.3031609356403351, "rewards/fixed_code_pass_all_test_reward/std": 0.36817196011543274, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 600.5, "completions/mean_terminated_length": 393.71429443359375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.02933038184836746, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.0002662021815922344, "learning_rate": 2.912442396313364e-06, "loss": 0.0, "num_tokens": 1482482.0, "reward": 1.2857143878936768, "reward_std": 0.6325119733810425, "rewards/fixed_code_pass_all_test_reward/mean": 0.4107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.39714542031288147, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 664.125, "completions/mean_terminated_length": 466.4285888671875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.029514849658734552, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0005041680597059894, "learning_rate": 2.9308755760368668e-06, "loss": 0.0, "num_tokens": 1494515.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 604.5, "completions/mean_terminated_length": 604.5, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.029699317469101642, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0005147759948158637, "learning_rate": 2.9493087557603687e-06, "loss": 0.0, "num_tokens": 1505223.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.029883785279468732, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0009588434695615433, "learning_rate": 2.967741935483871e-06, "loss": 0.0, "num_tokens": 1511101.0, "reward": 0.511363685131073, "reward_std": 0.5118144154548645, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.08416546881198883, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.030068253089835822, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.0004337240316090174, "learning_rate": 2.9861751152073735e-06, "loss": 0.0, "num_tokens": 1517459.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.030252720900202916, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.000409138987379265, "learning_rate": 3.004608294930876e-06, "loss": 0.0, "num_tokens": 1524921.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 525.5, "completions/mean_terminated_length": 525.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.030437188710570006, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.00037674761551897973, "learning_rate": 3.023041474654378e-06, "loss": 0.0, "num_tokens": 1536117.0, "reward": 1.2666666507720947, "reward_std": 0.881557047367096, "rewards/fixed_code_pass_all_test_reward/mean": 0.5166666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.5173191428184509, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 586.75, "completions/mean_terminated_length": 586.75, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.030621656520937096, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.0005971487025817623, "learning_rate": 3.0414746543778806e-06, "loss": 0.0, "num_tokens": 1546355.0, "reward": 1.03125, "reward_std": 0.5597928762435913, "rewards/fixed_code_pass_all_test_reward/mean": 0.28125, "rewards/fixed_code_pass_all_test_reward/std": 0.1833198070526123, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 508.0, "completions/mean_terminated_length": 508.0, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.030806124331304186, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0007573354814667255, "learning_rate": 3.0599078341013826e-06, "loss": 0.0, "num_tokens": 1556139.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.03099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0004650029350159457, "learning_rate": 3.078341013824885e-06, "loss": 0.0, "num_tokens": 1560921.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 941.875, "completions/mean_terminated_length": 783.857177734375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.03117505995203837, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.00032446978366351686, "learning_rate": 3.0967741935483874e-06, "loss": 0.0, "num_tokens": 1575536.0, "reward": 0.9453125, "reward_std": 0.6599068641662598, "rewards/fixed_code_pass_all_test_reward/mean": 0.1953125, "rewards/fixed_code_pass_all_test_reward/std": 0.33103513717651367, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.03135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0011351883367751725, "learning_rate": 3.1152073732718897e-06, "loss": 0.0, "num_tokens": 1581752.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 447.375, "completions/mean_terminated_length": 447.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.03154399557277255, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0008446653191640507, "learning_rate": 3.1336405529953917e-06, "loss": 0.0, "num_tokens": 1591875.0, "reward": 1.4375, "reward_std": 0.6603119969367981, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.38768237829208374, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.03172846338313964, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0004901043084828416, "learning_rate": 3.1520737327188945e-06, "loss": 0.0, "num_tokens": 1596970.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 386.5, "completions/mean_terminated_length": 386.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.03191293119350673, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.002058852749541984, "learning_rate": 3.1705069124423965e-06, "loss": 0.0001, "num_tokens": 1603902.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 927.75, "completions/mean_terminated_length": 554.3333740234375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.032097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0007004580547800288, "learning_rate": 3.1889400921658984e-06, "loss": 0.0, "num_tokens": 1623460.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 477.125, "completions/mean_terminated_length": 477.125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.03228186681424092, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0004310562726459466, "learning_rate": 3.2073732718894012e-06, "loss": 0.0, "num_tokens": 1635029.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 693.0, "completions/mean_terminated_length": 693.0, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.032466334624608004, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0006872646517877001, "learning_rate": 3.225806451612903e-06, "loss": 0.0, "num_tokens": 1645349.0, "reward": 1.1875, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 507.625, "completions/mean_terminated_length": 507.625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.0326508024349751, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.0011440341331763193, "learning_rate": 3.244239631336406e-06, "loss": 0.0, "num_tokens": 1653970.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.03283527024534219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0096435546875, "kl": 0.0006141647609183565, "learning_rate": 3.2626728110599084e-06, "loss": 0.0, "num_tokens": 1662603.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 504.75, "completions/mean_terminated_length": 504.75, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.03301973805570928, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0007241026942210738, "learning_rate": 3.2811059907834103e-06, "loss": 0.0, "num_tokens": 1674569.0, "reward": 1.0277777910232544, "reward_std": 0.4474107325077057, "rewards/fixed_code_pass_all_test_reward/mean": 0.2777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.13280318677425385, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.03320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0005435090661194408, "learning_rate": 3.299539170506913e-06, "loss": 0.0, "num_tokens": 1680073.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.03338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.0008517967544321436, "learning_rate": 3.317972350230415e-06, "loss": 0.0, "num_tokens": 1688330.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 426.125, "completions/mean_terminated_length": 426.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.03357314148681055, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0007898740223026834, "learning_rate": 3.336405529953917e-06, "loss": 0.0, "num_tokens": 1698747.0, "reward": 0.9451218843460083, "reward_std": 0.39127692580223083, "rewards/fixed_code_pass_all_test_reward/mean": 0.07012195140123367, "rewards/fixed_code_pass_all_test_reward/std": 0.0897931158542633, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 441.25, "completions/mean_terminated_length": 441.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.033757609297177645, "frac_reward_zero_std": 0.0, "grad_norm": 37.25, "kl": 0.004780470324476482, "learning_rate": 3.35483870967742e-06, "loss": 0.0002, "num_tokens": 1710133.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.03394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0006970599461055826, "learning_rate": 3.373271889400922e-06, "loss": 0.0, "num_tokens": 1715054.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 397.75, "completions/mean_terminated_length": 397.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.034126544917911825, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.00037298203278623987, "learning_rate": 3.391705069124424e-06, "loss": 0.0, "num_tokens": 1722196.0, "reward": 1.28125, "reward_std": 0.6967719197273254, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.4759858250617981, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 414.5, "completions/mean_terminated_length": 414.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.03431101272827892, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0009564059146214277, "learning_rate": 3.4101382488479266e-06, "loss": 0.0, "num_tokens": 1730192.0, "reward": 0.8088235259056091, "reward_std": 0.4387890696525574, "rewards/fixed_code_pass_all_test_reward/mean": 0.05882352963089943, "rewards/fixed_code_pass_all_test_reward/std": 0.10892001539468765, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 556.75, "completions/mean_terminated_length": 556.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.034495480538646005, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.0005112210856168531, "learning_rate": 3.428571428571429e-06, "loss": 0.0, "num_tokens": 1739518.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.0346799483490131, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.000551451896171784, "learning_rate": 3.447004608294931e-06, "loss": 0.0, "num_tokens": 1748572.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.034864416159380185, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.0009702303104859311, "learning_rate": 3.4654377880184337e-06, "loss": 0.0, "num_tokens": 1758527.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 668.375, "completions/mean_terminated_length": 668.375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.03504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.0005567252701439429, "learning_rate": 3.4838709677419357e-06, "loss": 0.0, "num_tokens": 1772154.0, "reward": 0.8125, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 390.375, "completions/mean_terminated_length": 390.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.03523335178011437, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.000561461471079383, "learning_rate": 3.502304147465438e-06, "loss": 0.0, "num_tokens": 1778469.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 478.75, "completions/mean_terminated_length": 478.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.03541781959048146, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0013180634887248743, "learning_rate": 3.5207373271889404e-06, "loss": 0.0001, "num_tokens": 1789611.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 536.875, "completions/mean_terminated_length": 536.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.03560228740084855, "frac_reward_zero_std": 1.0, "grad_norm": 0.00701904296875, "kl": 0.0004805325843335595, "learning_rate": 3.539170506912443e-06, "loss": 0.0, "num_tokens": 1799818.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.035786755211215646, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0009351367934868904, "learning_rate": 3.5576036866359448e-06, "loss": 0.0, "num_tokens": 1804221.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.03597122302158273, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0007711461621511262, "learning_rate": 3.5760368663594476e-06, "loss": 0.0, "num_tokens": 1811261.0, "reward": 1.3306450843811035, "reward_std": 0.244304820895195, "rewards/fixed_code_pass_all_test_reward/mean": 0.3306451439857483, "rewards/fixed_code_pass_all_test_reward/std": 0.2443048357963562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 444.0, "completions/mean_terminated_length": 444.0, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.036155690831949826, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0012634655431611463, "learning_rate": 3.5944700460829495e-06, "loss": 0.0001, "num_tokens": 1819717.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.03634015864231691, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0018502646016713697, "learning_rate": 3.6129032258064515e-06, "loss": 0.0001, "num_tokens": 1825000.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 462.75, "completions/mean_terminated_length": 462.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.036524626452684006, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.0008934969082474709, "learning_rate": 3.6313364055299543e-06, "loss": 0.0, "num_tokens": 1834214.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 469.375, "completions/mean_terminated_length": 469.375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.0367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.0009618061376386322, "learning_rate": 3.6497695852534567e-06, "loss": 0.0, "num_tokens": 1842385.0, "reward": 1.5367646217346191, "reward_std": 0.3228602707386017, "rewards/fixed_code_pass_all_test_reward/mean": 0.6617646813392639, "rewards/fixed_code_pass_all_test_reward/std": 0.20558202266693115, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 394.875, "completions/mean_terminated_length": 394.875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.036893562073418186, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.0004899756622762652, "learning_rate": 3.6682027649769586e-06, "loss": 0.0, "num_tokens": 1849768.0, "reward": 1.8571429252624512, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 989.625, "completions/mean_terminated_length": 838.4285888671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.03707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0006023468413332012, "learning_rate": 3.6866359447004615e-06, "loss": 0.0, "num_tokens": 1867349.0, "reward": 0.6160714626312256, "reward_std": 0.6958723664283752, "rewards/fixed_code_pass_all_test_reward/mean": 0.2410714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.3705395758152008, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 404.375, "completions/mean_terminated_length": 404.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.03726249769415237, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0007531879418820608, "learning_rate": 3.7050691244239634e-06, "loss": 0.0, "num_tokens": 1876640.0, "reward": 1.6812500953674316, "reward_std": 0.3837665617465973, "rewards/fixed_code_pass_all_test_reward/mean": 0.8062499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2786286771297455, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.03744696550451946, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.0011236622594879009, "learning_rate": 3.7235023041474654e-06, "loss": 0.0, "num_tokens": 1881537.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 644.625, "completions/mean_terminated_length": 644.625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.03763143331488655, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.0008307326352223754, "learning_rate": 3.741935483870968e-06, "loss": 0.0, "num_tokens": 1893966.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 417.875, "completions/mean_terminated_length": 417.875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.03781590112525364, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0008440261881332844, "learning_rate": 3.76036866359447e-06, "loss": 0.0, "num_tokens": 1902429.0, "reward": 1.0119047164916992, "reward_std": 0.03367177024483681, "rewards/fixed_code_pass_all_test_reward/mean": 0.011904762126505375, "rewards/fixed_code_pass_all_test_reward/std": 0.033671751618385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 661.375, "completions/mean_terminated_length": 661.375, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.03800036893562073, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.0005015727947466075, "learning_rate": 3.7788018433179725e-06, "loss": 0.0, "num_tokens": 1914760.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.03818483674598783, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0009364754696434829, "learning_rate": 3.797235023041475e-06, "loss": 0.0, "num_tokens": 1920421.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 403.625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.03836930455635491, "frac_reward_zero_std": 1.0, "grad_norm": 0.0128173828125, "kl": 0.001008490078675095, "learning_rate": 3.815668202764977e-06, "loss": 0.0, "num_tokens": 1929066.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 837.125, "completions/mean_terminated_length": 664.1428833007812, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.03855377236672201, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0006416336327674799, "learning_rate": 3.83410138248848e-06, "loss": 0.0, "num_tokens": 1939659.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 589.0, "completions/mean_terminated_length": 589.0, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.0387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.00039901621858007275, "learning_rate": 3.852534562211982e-06, "loss": 0.0, "num_tokens": 1950787.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.03892270798745619, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.002401056233793497, "learning_rate": 3.870967741935484e-06, "loss": 0.0001, "num_tokens": 1956708.0, "reward": 1.149999976158142, "reward_std": 0.6047431826591492, "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.13887302577495575, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.03910717579782328, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0014002888492541388, "learning_rate": 3.889400921658986e-06, "loss": 0.0001, "num_tokens": 1963670.0, "reward": 0.9711538553237915, "reward_std": 0.8324946761131287, "rewards/fixed_code_pass_all_test_reward/mean": 0.3461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.41526252031326294, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.039291643608190374, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0014920804896974005, "learning_rate": 3.907834101382489e-06, "loss": 0.0001, "num_tokens": 1970916.0, "reward": 1.1745688915252686, "reward_std": 0.06413999944925308, "rewards/fixed_code_pass_all_test_reward/mean": 0.17456898093223572, "rewards/fixed_code_pass_all_test_reward/std": 0.06413999199867249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 459.0, "completions/mean_terminated_length": 459.0, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.03947611141855746, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.0003846293820970459, "learning_rate": 3.926267281105991e-06, "loss": 0.0, "num_tokens": 1978796.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 887.125, "completions/mean_terminated_length": 887.125, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.039660579228924554, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.0005514328586286865, "learning_rate": 3.9447004608294935e-06, "loss": 0.0, "num_tokens": 1996381.0, "reward": 0.8274999856948853, "reward_std": 0.5537340641021729, "rewards/fixed_code_pass_all_test_reward/mean": 0.07750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.21920311450958252, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.03984504703929164, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0032268812356051058, "learning_rate": 3.963133640552996e-06, "loss": 0.0001, "num_tokens": 2001982.0, "reward": 1.15625, "reward_std": 0.6365013122558594, "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, "rewards/fixed_code_pass_all_test_reward/std": 0.4943881630897522, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 448.375, "completions/mean_terminated_length": 219.85714721679688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.040029514849658734, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0007804899214534089, "learning_rate": 3.981566820276498e-06, "loss": 0.0, "num_tokens": 2008481.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 426.25, "completions/mean_terminated_length": 426.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.04021398266002583, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0016736947472963948, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "num_tokens": 2019787.0, "reward": 1.685049057006836, "reward_std": 0.45926016569137573, "rewards/fixed_code_pass_all_test_reward/mean": 0.8100490570068359, "rewards/fixed_code_pass_all_test_reward/std": 0.37442171573638916, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 494.875, "completions/mean_terminated_length": 494.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.040398450470392915, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0005830442496517207, "learning_rate": 4.018433179723503e-06, "loss": 0.0, "num_tokens": 2030442.0, "reward": 1.6057692766189575, "reward_std": 0.33765512704849243, "rewards/fixed_code_pass_all_test_reward/mean": 0.6057692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.3376551568508148, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 641.375, "completions/mean_terminated_length": 641.375, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.04058291828076001, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0011189756587555166, "learning_rate": 4.036866359447005e-06, "loss": 0.0, "num_tokens": 2041941.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 350.75, "completions/mean_terminated_length": 350.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0407673860911271, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0011060474753321614, "learning_rate": 4.055299539170508e-06, "loss": 0.0, "num_tokens": 2050891.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.04095185390149419, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0011242312903050333, "learning_rate": 4.073732718894009e-06, "loss": 0.0, "num_tokens": 2055435.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.04113632171186128, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0007512669253628701, "learning_rate": 4.092165898617512e-06, "loss": 0.0, "num_tokens": 2060448.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.04132078952222837, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.001078595056242193, "learning_rate": 4.110599078341014e-06, "loss": 0.0, "num_tokens": 2065170.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 457.75, "completions/mean_terminated_length": 230.57144165039062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.04150525733259546, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.000705373466189485, "learning_rate": 4.1290322580645165e-06, "loss": 0.0, "num_tokens": 2072344.0, "reward": 1.2083332538604736, "reward_std": 0.6943650841712952, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 436.0, "completions/mean_terminated_length": 436.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.041689725142962555, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0036429875399335288, "learning_rate": 4.147465437788019e-06, "loss": 0.0001, "num_tokens": 2081048.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 553.75, "completions/mean_terminated_length": 553.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.04187419295332964, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0008196749004127923, "learning_rate": 4.165898617511521e-06, "loss": 0.0, "num_tokens": 2091966.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.042058660763696736, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.001910282313474454, "learning_rate": 4.184331797235024e-06, "loss": 0.0001, "num_tokens": 2096362.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 459.5, "completions/mean_terminated_length": 459.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.04224312857406383, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.0005780758474429604, "learning_rate": 4.202764976958525e-06, "loss": 0.0, "num_tokens": 2109766.0, "reward": 1.9039256572723389, "reward_std": 0.05909234285354614, "rewards/fixed_code_pass_all_test_reward/mean": 0.9039256572723389, "rewards/fixed_code_pass_all_test_reward/std": 0.059092361479997635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.042427596384430916, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.0012948763542226516, "learning_rate": 4.221198156682028e-06, "loss": 0.0001, "num_tokens": 2120027.0, "reward": 1.3125, "reward_std": 0.6512351036071777, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 643.375, "completions/mean_terminated_length": 442.71429443359375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.04261206419479801, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.0007696978900639806, "learning_rate": 4.23963133640553e-06, "loss": 0.0, "num_tokens": 2129558.0, "reward": 0.8352272510528564, "reward_std": 0.4413256049156189, "rewards/fixed_code_pass_all_test_reward/mean": 0.08522727340459824, "rewards/fixed_code_pass_all_test_reward/std": 0.0985056608915329, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 561.25, "completions/mean_terminated_length": 561.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.042796532005165096, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.0009734336363180773, "learning_rate": 4.258064516129032e-06, "loss": 0.0, "num_tokens": 2144648.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 480.75, "completions/mean_terminated_length": 480.75, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.04298099981553219, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0008978125115390867, "learning_rate": 4.276497695852535e-06, "loss": 0.0, "num_tokens": 2155806.0, "reward": 1.5431034564971924, "reward_std": 0.4456154704093933, "rewards/fixed_code_pass_all_test_reward/mean": 0.6681034564971924, "rewards/fixed_code_pass_all_test_reward/std": 0.26434552669525146, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 517.25, "completions/mean_terminated_length": 517.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.04316546762589928, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.001383633745717816, "learning_rate": 4.294930875576037e-06, "loss": 0.0001, "num_tokens": 2168128.0, "reward": 1.2123016119003296, "reward_std": 0.6406171321868896, "rewards/fixed_code_pass_all_test_reward/mean": 0.4623015820980072, "rewards/fixed_code_pass_all_test_reward/std": 0.4040108919143677, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.04334993543626637, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.0009061945747816935, "learning_rate": 4.3133640552995395e-06, "loss": 0.0, "num_tokens": 2173720.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.04353440324663346, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.00369867637346033, "learning_rate": 4.331797235023042e-06, "loss": 0.0001, "num_tokens": 2180910.0, "reward": 0.9937500357627869, "reward_std": 0.6662461161613464, "rewards/fixed_code_pass_all_test_reward/mean": 0.4937499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2555631101131439, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 736.625, "completions/mean_terminated_length": 549.2857666015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.043718871057000556, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.0015223033951770049, "learning_rate": 4.350230414746544e-06, "loss": 0.0001, "num_tokens": 2193475.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.04390333886736764, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0016937976288318168, "learning_rate": 4.368663594470047e-06, "loss": 0.0001, "num_tokens": 2197336.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.04408780667773474, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0010976589474012144, "learning_rate": 4.387096774193549e-06, "loss": 0.0, "num_tokens": 2205640.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 517.125, "completions/mean_terminated_length": 517.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.04427227448810182, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.00152742829232011, "learning_rate": 4.405529953917051e-06, "loss": 0.0001, "num_tokens": 2214385.0, "reward": 1.4545453786849976, "reward_std": 0.481045663356781, "rewards/fixed_code_pass_all_test_reward/mean": 0.7045454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.28645122051239014, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.04445674229846892, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0015726226556580514, "learning_rate": 4.423963133640554e-06, "loss": 0.0001, "num_tokens": 2221321.0, "reward": 1.0509259700775146, "reward_std": 0.027556447312235832, "rewards/fixed_code_pass_all_test_reward/mean": 0.05092592537403107, "rewards/fixed_code_pass_all_test_reward/std": 0.027556437999010086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.04464121010883601, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0009672840242274106, "learning_rate": 4.442396313364056e-06, "loss": 0.0, "num_tokens": 2226841.0, "reward": 1.3333333730697632, "reward_std": 0.5634361505508423, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 379.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.0448256779192031, "frac_reward_zero_std": 1.0, "grad_norm": 0.01397705078125, "kl": 0.0014169239657348953, "learning_rate": 4.460829493087558e-06, "loss": 0.0001, "num_tokens": 2237448.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 317.75, "completions/mean_terminated_length": 317.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.04501014572957019, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.002318118786206469, "learning_rate": 4.479262672811061e-06, "loss": 0.0001, "num_tokens": 2246310.0, "reward": 1.6744791269302368, "reward_std": 0.3847423791885376, "rewards/fixed_code_pass_all_test_reward/mean": 0.6744791269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.3847424387931824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.045194613539937284, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0016356291307602078, "learning_rate": 4.497695852534562e-06, "loss": 0.0001, "num_tokens": 2254373.0, "reward": 1.5, "reward_std": 0.483615517616272, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.48361557722091675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 380.625, "completions/mean_terminated_length": 380.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.04537908135030437, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0018660046262084506, "learning_rate": 4.516129032258065e-06, "loss": 0.0001, "num_tokens": 2264386.0, "reward": 1.3035714626312256, "reward_std": 0.46721991896629333, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.22908106446266174, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 305.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.045563549160671464, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.005900959156861063, "learning_rate": 4.534562211981567e-06, "loss": 0.0002, "num_tokens": 2273526.0, "reward": 1.1875, "reward_std": 0.5303300619125366, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.04574801697103855, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.0023254387742781546, "learning_rate": 4.5529953917050696e-06, "loss": 0.0001, "num_tokens": 2279606.0, "reward": 1.6875, "reward_std": 0.43129098415374756, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.43129095435142517, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.045932484781405644, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.004013640558696352, "learning_rate": 4.571428571428572e-06, "loss": 0.0002, "num_tokens": 2285673.0, "reward": 1.7421875, "reward_std": 0.27637138962745667, "rewards/fixed_code_pass_all_test_reward/mean": 0.7421875, "rewards/fixed_code_pass_all_test_reward/std": 0.27637138962745667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 873.375, "completions/mean_terminated_length": 873.375, "completions/min_length": 546.0, "completions/min_terminated_length": 546.0, "epoch": 0.04611695259177274, "frac_reward_zero_std": 1.0, "grad_norm": 0.019775390625, "kl": 0.001434162666555494, "learning_rate": 4.589861751152074e-06, "loss": 0.0001, "num_tokens": 2299244.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.046301420402139824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01007080078125, "kl": 0.0007213067365228198, "learning_rate": 4.608294930875577e-06, "loss": 0.0, "num_tokens": 2306348.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.04648588821250692, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.0020470824019866996, "learning_rate": 4.626728110599078e-06, "loss": 0.0001, "num_tokens": 2310249.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.04667035602287401, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.004192173364572227, "learning_rate": 4.6451612903225815e-06, "loss": 0.0002, "num_tokens": 2313993.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 549.0, "completions/mean_terminated_length": 549.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.0468548238332411, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.0034435808920534328, "learning_rate": 4.663594470046083e-06, "loss": 0.0001, "num_tokens": 2323729.0, "reward": 0.8857142925262451, "reward_std": 0.35913729667663574, "rewards/fixed_code_pass_all_test_reward/mean": 0.010714286006987095, "rewards/fixed_code_pass_all_test_reward/std": 0.030304577201604843, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.04703929164360819, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0010774861366371624, "learning_rate": 4.682027649769585e-06, "loss": 0.0, "num_tokens": 2328263.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 492.5, "completions/mean_terminated_length": 492.5, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.04722375945397528, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0019477826026559342, "learning_rate": 4.700460829493088e-06, "loss": 0.0001, "num_tokens": 2336891.0, "reward": 1.1500000953674316, "reward_std": 0.29760950803756714, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.29760950803756714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 422.5, "completions/mean_terminated_length": 422.5, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.04740822726434237, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.0012998752063140273, "learning_rate": 4.71889400921659e-06, "loss": 0.0001, "num_tokens": 2345839.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.047592695074709465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0272216796875, "kl": 0.002006904782319907, "learning_rate": 4.7373271889400925e-06, "loss": 0.0001, "num_tokens": 2355136.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.04777716288507655, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0020117964450037107, "learning_rate": 4.755760368663595e-06, "loss": 0.0001, "num_tokens": 2359440.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.047961630695443645, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.003070744496653788, "learning_rate": 4.774193548387097e-06, "loss": 0.0001, "num_tokens": 2363433.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 477.0, "completions/mean_terminated_length": 477.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.04814609850581074, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.0019301101892779116, "learning_rate": 4.7926267281106e-06, "loss": 0.0001, "num_tokens": 2372801.0, "reward": 1.9285714626312256, "reward_std": 0.15272073447704315, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.15272070467472076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 478.25, "completions/mean_terminated_length": 478.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.048330566316177825, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.001288857627514517, "learning_rate": 4.811059907834102e-06, "loss": 0.0001, "num_tokens": 2384587.0, "reward": 1.6130952835083008, "reward_std": 0.4839085340499878, "rewards/fixed_code_pass_all_test_reward/mean": 0.613095223903656, "rewards/fixed_code_pass_all_test_reward/std": 0.4839085638523102, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.04851503412654492, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.002426694249152206, "learning_rate": 4.8294930875576044e-06, "loss": 0.0001, "num_tokens": 2392219.0, "reward": 1.946874976158142, "reward_std": 0.15026018023490906, "rewards/fixed_code_pass_all_test_reward/mean": 0.9468749761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.15026019513607025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.048699501936912006, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0017713702836772427, "learning_rate": 4.847926267281106e-06, "loss": 0.0001, "num_tokens": 2401813.0, "reward": 1.277438998222351, "reward_std": 0.077059805393219, "rewards/fixed_code_pass_all_test_reward/mean": 0.27743902802467346, "rewards/fixed_code_pass_all_test_reward/std": 0.07705983519554138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 500.75, "completions/mean_terminated_length": 279.71429443359375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.0488839697472791, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.002141762764949817, "learning_rate": 4.866359447004609e-06, "loss": 0.0001, "num_tokens": 2409083.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.04906843755764619, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0015990414540283382, "learning_rate": 4.884792626728111e-06, "loss": 0.0001, "num_tokens": 2417918.0, "reward": 1.3392857313156128, "reward_std": 0.5503113269805908, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.21257823705673218, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 460.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.04925290536801328, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.00690473157010274, "learning_rate": 4.903225806451613e-06, "loss": 0.0003, "num_tokens": 2430333.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 551.875, "completions/mean_terminated_length": 551.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.04943737317838037, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0015725677512818947, "learning_rate": 4.9216589861751155e-06, "loss": 0.0001, "num_tokens": 2442668.0, "reward": 1.019230842590332, "reward_std": 0.05439284071326256, "rewards/fixed_code_pass_all_test_reward/mean": 0.01923076994717121, "rewards/fixed_code_pass_all_test_reward/std": 0.05439283326268196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.049621840988747466, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.0010252786814817227, "learning_rate": 4.940092165898618e-06, "loss": 0.0, "num_tokens": 2448225.0, "reward": 1.7142856121063232, "reward_std": 0.42515644431114197, "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.31886735558509827, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04980630879911455, "frac_reward_zero_std": 1.0, "grad_norm": 0.018798828125, "kl": 0.0011696503061102703, "learning_rate": 4.95852534562212e-06, "loss": 0.0, "num_tokens": 2454034.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 413.625, "completions/mean_terminated_length": 413.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.049990776609481646, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.003565599057765212, "learning_rate": 4.976958525345623e-06, "loss": 0.0001, "num_tokens": 2462727.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 495.75, "completions/mean_terminated_length": 495.75, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.05017524441984873, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0027229955594521016, "learning_rate": 4.995391705069125e-06, "loss": 0.0001, "num_tokens": 2473381.0, "reward": 1.4375, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 407.375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.050359712230215826, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.004993542192096356, "learning_rate": 5.013824884792627e-06, "loss": 0.0002, "num_tokens": 2481512.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 386.125, "completions/mean_terminated_length": 386.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.05054418004058292, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.0013789842996629886, "learning_rate": 5.032258064516129e-06, "loss": 0.0001, "num_tokens": 2491793.0, "reward": 1.2000000476837158, "reward_std": 0.21380895376205444, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 679.25, "completions/mean_terminated_length": 679.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.05072864785095001, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.002456085945595987, "learning_rate": 5.050691244239631e-06, "loss": 0.0001, "num_tokens": 2503851.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.0509131156613171, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0026333942369092256, "learning_rate": 5.0691244239631346e-06, "loss": 0.0001, "num_tokens": 2510483.0, "reward": 1.25, "reward_std": 0.0609799362719059, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.060979947447776794, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 825.75, "completions/mean_terminated_length": 825.75, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.051097583471684194, "frac_reward_zero_std": 0.0, "grad_norm": 0.486328125, "kl": 0.0008318830896314466, "learning_rate": 5.087557603686636e-06, "loss": 0.0, "num_tokens": 2524321.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.05128205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.002531564299715683, "learning_rate": 5.1059907834101385e-06, "loss": 0.0001, "num_tokens": 2534046.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 190.1428680419922, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.051466519092418374, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0036303385058999993, "learning_rate": 5.124423963133641e-06, "loss": 0.0001, "num_tokens": 2540289.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 704.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.05165098690278547, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0036192442967148963, "learning_rate": 5.142857142857142e-06, "loss": 0.0001, "num_tokens": 2555801.0, "reward": 0.855769157409668, "reward_std": 0.7256180047988892, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.24670268595218658, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.051835454713152554, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0028796335827792063, "learning_rate": 5.161290322580646e-06, "loss": 0.0001, "num_tokens": 2559824.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.05201992252351965, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0028399306756909937, "learning_rate": 5.179723502304148e-06, "loss": 0.0001, "num_tokens": 2563606.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 407.5, "completions/mean_terminated_length": 407.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.052204390333886734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0128173828125, "kl": 0.0011331468558637425, "learning_rate": 5.1981566820276495e-06, "loss": 0.0, "num_tokens": 2575122.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 402.0, "completions/mean_terminated_length": 402.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.05238885814425383, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0017614895186852664, "learning_rate": 5.216589861751153e-06, "loss": 0.0001, "num_tokens": 2583034.0, "reward": 1.5, "reward_std": 0.35040876269340515, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 515.875, "completions/mean_terminated_length": 515.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.05257332595462092, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0010412275369162671, "learning_rate": 5.235023041474655e-06, "loss": 0.0, "num_tokens": 2596281.0, "reward": 1.1646342277526855, "reward_std": 0.8171641826629639, "rewards/fixed_code_pass_all_test_reward/mean": 0.4146341383457184, "rewards/fixed_code_pass_all_test_reward/std": 0.4653362035751343, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.05275779376498801, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.001470310686272569, "learning_rate": 5.253456221198157e-06, "loss": 0.0001, "num_tokens": 2601865.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0529422615753551, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.002124873222783208, "learning_rate": 5.271889400921659e-06, "loss": 0.0001, "num_tokens": 2607035.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.053126729385722195, "frac_reward_zero_std": 1.0, "grad_norm": 0.029052734375, "kl": 0.002531195234041661, "learning_rate": 5.290322580645162e-06, "loss": 0.0001, "num_tokens": 2611099.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.05331119719608928, "frac_reward_zero_std": 1.0, "grad_norm": 0.035888671875, "kl": 0.0032399199990322813, "learning_rate": 5.308755760368665e-06, "loss": 0.0001, "num_tokens": 2615141.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 464.375, "completions/mean_terminated_length": 464.375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.053495665006456375, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.0015845611487748101, "learning_rate": 5.327188940092166e-06, "loss": 0.0001, "num_tokens": 2623776.0, "reward": 1.4642857313156128, "reward_std": 0.28656336665153503, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.28656336665153503, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.05368013281682346, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0026175471139140427, "learning_rate": 5.345622119815669e-06, "loss": 0.0001, "num_tokens": 2628612.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.053864600627190555, "frac_reward_zero_std": 0.0, "grad_norm": 3.703125, "kl": 0.008950897346949205, "learning_rate": 5.364055299539172e-06, "loss": 0.0004, "num_tokens": 2632540.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 748.125, "completions/mean_terminated_length": 562.4285888671875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.05404906843755765, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.0016333454186678864, "learning_rate": 5.382488479262673e-06, "loss": 0.0001, "num_tokens": 2646797.0, "reward": 1.3562500476837158, "reward_std": 0.7317047715187073, "rewards/fixed_code_pass_all_test_reward/mean": 0.6062500476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.2982671558856964, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 500.125, "completions/mean_terminated_length": 279.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.054233536247924735, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.0013850810464646202, "learning_rate": 5.400921658986176e-06, "loss": 0.0001, "num_tokens": 2656870.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.05441800405829183, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.003704235816258006, "learning_rate": 5.419354838709678e-06, "loss": 0.0001, "num_tokens": 2666049.0, "reward": 1.442460298538208, "reward_std": 0.48742881417274475, "rewards/fixed_code_pass_all_test_reward/mean": 0.442460298538208, "rewards/fixed_code_pass_all_test_reward/std": 0.48742881417274475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.05460247186865892, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.0048446366272401065, "learning_rate": 5.43778801843318e-06, "loss": 0.0002, "num_tokens": 2672257.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 463.75, "completions/mean_terminated_length": 463.75, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.05478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0022193394688656554, "learning_rate": 5.456221198156683e-06, "loss": 0.0001, "num_tokens": 2681255.0, "reward": 1.5957791805267334, "reward_std": 0.1994423270225525, "rewards/fixed_code_pass_all_test_reward/mean": 0.5957792401313782, "rewards/fixed_code_pass_all_test_reward/std": 0.19944234192371368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.0549714074893931, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.0016861603508004919, "learning_rate": 5.474654377880185e-06, "loss": 0.0001, "num_tokens": 2686894.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.05515587529976019, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0034283193526789546, "learning_rate": 5.493087557603687e-06, "loss": 0.0001, "num_tokens": 2693891.0, "reward": 1.6574074029922485, "reward_std": 0.41373157501220703, "rewards/fixed_code_pass_all_test_reward/mean": 0.7824074029922485, "rewards/fixed_code_pass_all_test_reward/std": 0.32915517687797546, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.05534034311012728, "frac_reward_zero_std": 1.0, "grad_norm": 0.023193359375, "kl": 0.002563972091593314, "learning_rate": 5.511520737327189e-06, "loss": 0.0001, "num_tokens": 2699527.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 381.875, "completions/mean_terminated_length": 381.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.055524810920494376, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.0031164427782641724, "learning_rate": 5.529953917050692e-06, "loss": 0.0001, "num_tokens": 2710078.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.05570927873086146, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0028207486757310107, "learning_rate": 5.548387096774194e-06, "loss": 0.0001, "num_tokens": 2719122.0, "reward": 1.21484375, "reward_std": 0.4048525094985962, "rewards/fixed_code_pass_all_test_reward/mean": 0.21484375, "rewards/fixed_code_pass_all_test_reward/std": 0.4048525094985962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.055893746541228556, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.005300380915286951, "learning_rate": 5.566820276497696e-06, "loss": 0.0002, "num_tokens": 2728234.0, "reward": 1.021276593208313, "reward_std": 0.7291032671928406, "rewards/fixed_code_pass_all_test_reward/mean": 0.271276593208313, "rewards/fixed_code_pass_all_test_reward/std": 0.4525967538356781, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 368.125, "completions/mean_terminated_length": 368.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.05607821435159565, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.003201348183210939, "learning_rate": 5.585253456221199e-06, "loss": 0.0001, "num_tokens": 2734163.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 585.25, "completions/mean_terminated_length": 585.25, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.056262682161962736, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.0038694553077220917, "learning_rate": 5.6036866359447e-06, "loss": 0.0002, "num_tokens": 2746677.0, "reward": 0.692307710647583, "reward_std": 0.6855592727661133, "rewards/fixed_code_pass_all_test_reward/mean": 0.19230769574642181, "rewards/fixed_code_pass_all_test_reward/std": 0.22893041372299194, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 603.75, "completions/mean_terminated_length": 603.75, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.05644714997232983, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.002914172764576506, "learning_rate": 5.6221198156682035e-06, "loss": 0.0001, "num_tokens": 2758867.0, "reward": 0.7434210777282715, "reward_std": 0.4719580113887787, "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.1839417964220047, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.056631617782696916, "frac_reward_zero_std": 1.0, "grad_norm": 0.02587890625, "kl": 0.002347299494431354, "learning_rate": 5.640552995391706e-06, "loss": 0.0001, "num_tokens": 2768447.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.05681608559306401, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.004283955306163989, "learning_rate": 5.658986175115207e-06, "loss": 0.0002, "num_tokens": 2775082.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.0570005534034311, "frac_reward_zero_std": 1.0, "grad_norm": 0.038818359375, "kl": 0.003466296271653846, "learning_rate": 5.677419354838711e-06, "loss": 0.0001, "num_tokens": 2779626.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.05718502121379819, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.004007494033430703, "learning_rate": 5.695852534562213e-06, "loss": 0.0002, "num_tokens": 2783392.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 509.75, "completions/mean_terminated_length": 509.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.057369489024165284, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.0033406703441869467, "learning_rate": 5.7142857142857145e-06, "loss": 0.0001, "num_tokens": 2792878.0, "reward": 1.3725961446762085, "reward_std": 0.30853894352912903, "rewards/fixed_code_pass_all_test_reward/mean": 0.3725961446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.30853894352912903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.05755395683453238, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0027820689138025045, "learning_rate": 5.732718894009217e-06, "loss": 0.0001, "num_tokens": 2801829.0, "reward": 1.274999976158142, "reward_std": 0.4527692198753357, "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 413.625, "completions/mean_terminated_length": 413.625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.057738424644899464, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.0015533613477600738, "learning_rate": 5.75115207373272e-06, "loss": 0.0001, "num_tokens": 2809290.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 668.125, "completions/mean_terminated_length": 668.125, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.05792289245526656, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.00535172458330635, "learning_rate": 5.769585253456222e-06, "loss": 0.0002, "num_tokens": 2821251.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.058107360265633644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0308837890625, "kl": 0.0026180390486842953, "learning_rate": 5.788018433179724e-06, "loss": 0.0001, "num_tokens": 2826263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 589.125, "completions/mean_terminated_length": 589.125, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.05829182807600074, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.00468234814616153, "learning_rate": 5.806451612903226e-06, "loss": 0.0002, "num_tokens": 2835680.0, "reward": 1.4485294818878174, "reward_std": 0.5965276956558228, "rewards/fixed_code_pass_all_test_reward/mean": 0.5735294222831726, "rewards/fixed_code_pass_all_test_reward/std": 0.3663422167301178, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.05847629588636783, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0038610458141192794, "learning_rate": 5.824884792626728e-06, "loss": 0.0002, "num_tokens": 2842879.0, "reward": 1.0592105388641357, "reward_std": 0.46092891693115234, "rewards/fixed_code_pass_all_test_reward/mean": 0.18421052396297455, "rewards/fixed_code_pass_all_test_reward/std": 0.18661163747310638, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.05866076369673492, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.00386939563031774, "learning_rate": 5.843317972350231e-06, "loss": 0.0002, "num_tokens": 2847086.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.05884523150710201, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.003912389889592305, "learning_rate": 5.8617511520737336e-06, "loss": 0.0002, "num_tokens": 2853679.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.059029699317469105, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.0047894025628920645, "learning_rate": 5.880184331797235e-06, "loss": 0.0002, "num_tokens": 2861231.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 769.625, "completions/mean_terminated_length": 587.0, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.05921416712783619, "frac_reward_zero_std": 0.0, "grad_norm": 0.416015625, "kl": 0.001476364897825988, "learning_rate": 5.8986175115207375e-06, "loss": 0.0001, "num_tokens": 2876108.0, "reward": 1.4777777194976807, "reward_std": 0.6443897485733032, "rewards/fixed_code_pass_all_test_reward/mean": 0.6027777791023254, "rewards/fixed_code_pass_all_test_reward/std": 0.343534380197525, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 387.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.059398634938203285, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.004371067494503222, "learning_rate": 5.917050691244241e-06, "loss": 0.0002, "num_tokens": 2885529.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 384.5, "completions/mean_terminated_length": 146.85714721679688, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.05958310274857037, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.003469252842478454, "learning_rate": 5.935483870967742e-06, "loss": 0.0001, "num_tokens": 2891517.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.059767570558937465, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.004643242893507704, "learning_rate": 5.953917050691245e-06, "loss": 0.0002, "num_tokens": 2901291.0, "reward": 1.1428570747375488, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 528.75, "completions/mean_terminated_length": 528.75, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.05995203836930456, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.002668224580702372, "learning_rate": 5.972350230414747e-06, "loss": 0.0001, "num_tokens": 2914641.0, "reward": 1.0833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022911310196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.060136506179671645, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.008318491956742946, "learning_rate": 5.9907834101382485e-06, "loss": 0.0003, "num_tokens": 2918789.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.06032097399003874, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0056224293512059376, "learning_rate": 6.009216589861752e-06, "loss": 0.0002, "num_tokens": 2925697.0, "reward": 1.3863636255264282, "reward_std": 0.34274399280548096, "rewards/fixed_code_pass_all_test_reward/mean": 0.3863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.34274399280548096, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.06050544180040583, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0036366573767736554, "learning_rate": 6.027649769585254e-06, "loss": 0.0001, "num_tokens": 2933858.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.06068990961077292, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.005450486583868042, "learning_rate": 6.046082949308756e-06, "loss": 0.0002, "num_tokens": 2944551.0, "reward": 1.3977272510528564, "reward_std": 0.5001475811004639, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, "rewards/fixed_code_pass_all_test_reward/std": 0.5001475811004639, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06087437742114001, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.004170244283159263, "learning_rate": 6.064516129032259e-06, "loss": 0.0002, "num_tokens": 2954682.0, "reward": 1.813636302947998, "reward_std": 0.12206411361694336, "rewards/fixed_code_pass_all_test_reward/mean": 0.8136363625526428, "rewards/fixed_code_pass_all_test_reward/std": 0.12206411361694336, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.0610588452315071, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.008031944889808074, "learning_rate": 6.082949308755761e-06, "loss": 0.0003, "num_tokens": 2960441.0, "reward": 1.6750000715255737, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.3535534143447876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.06124331304187419, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.007419643545290455, "learning_rate": 6.101382488479263e-06, "loss": 0.0003, "num_tokens": 2970032.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.061427780852241286, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.005558678589295596, "learning_rate": 6.119815668202765e-06, "loss": 0.0002, "num_tokens": 2976407.0, "reward": 1.1590909957885742, "reward_std": 0.32867667078971863, "rewards/fixed_code_pass_all_test_reward/mean": 0.15909090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.32867664098739624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 1174.75, "completions/mean_terminated_length": 1174.75, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.06161224866260837, "frac_reward_zero_std": 0.0, "grad_norm": 0.453125, "kl": 0.000754260050598532, "learning_rate": 6.1382488479262684e-06, "loss": 0.0, "num_tokens": 3003333.0, "reward": 1.6739130020141602, "reward_std": 0.3109314739704132, "rewards/fixed_code_pass_all_test_reward/mean": 0.6739130020141602, "rewards/fixed_code_pass_all_test_reward/std": 0.3109314441680908, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.061796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.00738844892475754, "learning_rate": 6.15668202764977e-06, "loss": 0.0003, "num_tokens": 3010109.0, "reward": 0.7749999761581421, "reward_std": 0.48080289363861084, "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, "rewards/fixed_code_pass_all_test_reward/std": 0.05099019780755043, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 503.0, "completions/mean_terminated_length": 503.0, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.06198118428334256, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.003005138802109286, "learning_rate": 6.175115207373272e-06, "loss": 0.0001, "num_tokens": 3024045.0, "reward": 1.3011362552642822, "reward_std": 0.2535898983478546, "rewards/fixed_code_pass_all_test_reward/mean": 0.3011363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.253589928150177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.062165652093709646, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.010897418716922402, "learning_rate": 6.193548387096775e-06, "loss": 0.0004, "num_tokens": 3027725.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06235011990407674, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0038318773731589317, "learning_rate": 6.211981566820276e-06, "loss": 0.0002, "num_tokens": 3037226.0, "reward": 1.0, "reward_std": 0.623354971408844, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.21380899846553802, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.06253458771444383, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.005482116554048844, "learning_rate": 6.2304147465437795e-06, "loss": 0.0002, "num_tokens": 3046671.0, "reward": 1.6324257850646973, "reward_std": 0.5073834657669067, "rewards/fixed_code_pass_all_test_reward/mean": 0.6324257850646973, "rewards/fixed_code_pass_all_test_reward/std": 0.5073834657669067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.06271905552481093, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.004501984847593121, "learning_rate": 6.248847926267282e-06, "loss": 0.0002, "num_tokens": 3054717.0, "reward": 1.3203125, "reward_std": 0.37565046548843384, "rewards/fixed_code_pass_all_test_reward/mean": 0.4453125, "rewards/fixed_code_pass_all_test_reward/std": 0.2227003127336502, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06290352333517801, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.003907048871042207, "learning_rate": 6.267281105990783e-06, "loss": 0.0002, "num_tokens": 3066745.0, "reward": 1.6853448152542114, "reward_std": 0.28281721472740173, "rewards/fixed_code_pass_all_test_reward/mean": 0.6853448152542114, "rewards/fixed_code_pass_all_test_reward/std": 0.28281718492507935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 547.875, "completions/mean_terminated_length": 547.875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.0630879911455451, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.005283918377244845, "learning_rate": 6.285714285714286e-06, "loss": 0.0002, "num_tokens": 3081608.0, "reward": 0.7791666984558105, "reward_std": 0.4830254018306732, "rewards/fixed_code_pass_all_test_reward/mean": 0.02916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.04859127476811409, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06327245895591219, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.00352828815812245, "learning_rate": 6.304147465437789e-06, "loss": 0.0001, "num_tokens": 3086447.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 543.25, "completions/mean_terminated_length": 543.25, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.06345692676627929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0223388671875, "kl": 0.0029360177359194495, "learning_rate": 6.3225806451612906e-06, "loss": 0.0001, "num_tokens": 3096713.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.06364139457664637, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.004165337551967241, "learning_rate": 6.341013824884793e-06, "loss": 0.0002, "num_tokens": 3103606.0, "reward": 1.5833332538604736, "reward_std": 0.4671414792537689, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.31405800580978394, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.06382586238701346, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.004217600362608209, "learning_rate": 6.359447004608295e-06, "loss": 0.0002, "num_tokens": 3111296.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.06401033019738056, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.00600789362215437, "learning_rate": 6.377880184331797e-06, "loss": 0.0002, "num_tokens": 3119138.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 430.25, "completions/mean_terminated_length": 430.25, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.06419479800774765, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.003173600045556668, "learning_rate": 6.3963133640553e-06, "loss": 0.0001, "num_tokens": 3128404.0, "reward": 1.34375, "reward_std": 0.4614343047142029, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.4416610598564148, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 826.875, "completions/mean_terminated_length": 826.875, "completions/min_length": 567.0, "completions/min_terminated_length": 567.0, "epoch": 0.06437926581811473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.004105518906726502, "learning_rate": 6.4147465437788025e-06, "loss": 0.0002, "num_tokens": 3144963.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.06456373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.006416238553356379, "learning_rate": 6.433179723502304e-06, "loss": 0.0003, "num_tokens": 3153064.0, "reward": 1.0972223281860352, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.0972222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.06474820143884892, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.004348761518485844, "learning_rate": 6.451612903225806e-06, "loss": 0.0002, "num_tokens": 3157559.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 470.375, "completions/mean_terminated_length": 245.00001525878906, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06493266924921601, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.006047164803021587, "learning_rate": 6.47004608294931e-06, "loss": 0.0002, "num_tokens": 3166946.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.06511713705958311, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.020688183431047946, "learning_rate": 6.488479262672812e-06, "loss": 0.0008, "num_tokens": 3172228.0, "reward": 1.6136363744735718, "reward_std": 0.33313649892807007, "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.33313652873039246, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0653016048699502, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0035269658983452246, "learning_rate": 6.5069124423963135e-06, "loss": 0.0001, "num_tokens": 3177238.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.06548607268031728, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.005910440318984911, "learning_rate": 6.525345622119817e-06, "loss": 0.0002, "num_tokens": 3181868.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.06567054049068438, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.00930082905688323, "learning_rate": 6.543778801843319e-06, "loss": 0.0004, "num_tokens": 3192702.0, "reward": 1.4500000476837158, "reward_std": 0.47207745909690857, "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.47207748889923096, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.06585500830105147, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.00542910625517834, "learning_rate": 6.562211981566821e-06, "loss": 0.0002, "num_tokens": 3201861.0, "reward": 1.392045497894287, "reward_std": 0.5055404305458069, "rewards/fixed_code_pass_all_test_reward/mean": 0.39204543828964233, "rewards/fixed_code_pass_all_test_reward/std": 0.5055404305458069, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 417.25, "completions/mean_terminated_length": 417.25, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.06603947611141855, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.006025874783517793, "learning_rate": 6.580645161290323e-06, "loss": 0.0002, "num_tokens": 3210375.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 388.875, "completions/mean_terminated_length": 388.875, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.06622394392178566, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.0059027707611676306, "learning_rate": 6.599078341013826e-06, "loss": 0.0002, "num_tokens": 3220742.0, "reward": 1.7857142686843872, "reward_std": 0.5060566067695618, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.16121803224086761, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.06640841173215274, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.00841543628484942, "learning_rate": 6.617511520737328e-06, "loss": 0.0003, "num_tokens": 3228940.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 556.0, "completions/mean_terminated_length": 556.0, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.06659287954251983, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.003931356506654993, "learning_rate": 6.63594470046083e-06, "loss": 0.0002, "num_tokens": 3245828.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.06677734735288691, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.011652858345769346, "learning_rate": 6.6543778801843326e-06, "loss": 0.0005, "num_tokens": 3250540.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.06696181516325402, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.005880098877241835, "learning_rate": 6.672811059907834e-06, "loss": 0.0002, "num_tokens": 3258895.0, "reward": 1.904761791229248, "reward_std": 0.154827281832695, "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.1548272967338562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 474.5, "completions/mean_terminated_length": 474.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.0671462829736211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0142822265625, "kl": 0.0012356218394415919, "learning_rate": 6.691244239631337e-06, "loss": 0.0, "num_tokens": 3267659.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.06733075078398819, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.008868088538292795, "learning_rate": 6.70967741935484e-06, "loss": 0.0004, "num_tokens": 3276162.0, "reward": 1.3900861740112305, "reward_std": 0.7316022515296936, "rewards/fixed_code_pass_all_test_reward/mean": 0.6400862336158752, "rewards/fixed_code_pass_all_test_reward/std": 0.42055612802505493, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.06751521859435529, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.008630494150565937, "learning_rate": 6.728110599078341e-06, "loss": 0.0003, "num_tokens": 3282464.0, "reward": 1.1541666984558105, "reward_std": 0.34592297673225403, "rewards/fixed_code_pass_all_test_reward/mean": 0.15416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.34592294692993164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 233.875, "completions/mean_terminated_length": 233.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.06769968640472238, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.0059770430088974535, "learning_rate": 6.746543778801844e-06, "loss": 0.0002, "num_tokens": 3290935.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.06788415421508946, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.005427137948572636, "learning_rate": 6.764976958525347e-06, "loss": 0.0002, "num_tokens": 3298911.0, "reward": 1.5625, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.06806862202545656, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.004619544371962547, "learning_rate": 6.783410138248848e-06, "loss": 0.0002, "num_tokens": 3312248.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.06825308983582365, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.00374437149730511, "learning_rate": 6.801843317972351e-06, "loss": 0.0001, "num_tokens": 3316453.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 109.875, "completions/mean_terminated_length": 109.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.06843755764619074, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.00852827716153115, "learning_rate": 6.820276497695853e-06, "loss": 0.0003, "num_tokens": 3320212.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.06862202545655784, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.006679936865111813, "learning_rate": 6.838709677419355e-06, "loss": 0.0003, "num_tokens": 3326369.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.06880649326692492, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.003922005329513922, "learning_rate": 6.857142857142858e-06, "loss": 0.0002, "num_tokens": 3330752.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.06899096107729201, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.01255025013233535, "learning_rate": 6.87557603686636e-06, "loss": 0.0005, "num_tokens": 3340221.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.06917542888765911, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0056608308223076165, "learning_rate": 6.894009216589862e-06, "loss": 0.0002, "num_tokens": 3345115.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 645.0, "completions/mean_terminated_length": 444.5714416503906, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.0693598966980262, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.004642765736207366, "learning_rate": 6.912442396313365e-06, "loss": 0.0002, "num_tokens": 3356483.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 523.625, "completions/mean_terminated_length": 523.625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.06954436450839328, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.003307518913061358, "learning_rate": 6.9308755760368674e-06, "loss": 0.0001, "num_tokens": 3369144.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.06972883231876037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0341796875, "kl": 0.00414440970052965, "learning_rate": 6.949308755760369e-06, "loss": 0.0002, "num_tokens": 3377484.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.06991330012912747, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.006078801321564242, "learning_rate": 6.967741935483871e-06, "loss": 0.0002, "num_tokens": 3384643.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.07009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 3.796875, "kl": 0.007344163459492847, "learning_rate": 6.986175115207375e-06, "loss": 0.0003, "num_tokens": 3388313.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 434.25, "completions/mean_terminated_length": 434.25, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.07028223574986164, "frac_reward_zero_std": 1.0, "grad_norm": 0.034912109375, "kl": 0.004899154737358913, "learning_rate": 7.004608294930876e-06, "loss": 0.0002, "num_tokens": 3396683.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 492.25, "completions/mean_terminated_length": 492.25, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.07046670356022874, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.011205047485418618, "learning_rate": 7.0230414746543785e-06, "loss": 0.0004, "num_tokens": 3410477.0, "reward": 1.2999999523162842, "reward_std": 0.440778523683548, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.44077855348587036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.07065117137059583, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.008305327268317342, "learning_rate": 7.041474654377881e-06, "loss": 0.0003, "num_tokens": 3419649.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 560.375, "completions/mean_terminated_length": 347.8571472167969, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.07083563918096292, "frac_reward_zero_std": 0.0, "grad_norm": 0.5390625, "kl": 0.005096556022181176, "learning_rate": 7.059907834101382e-06, "loss": 0.0002, "num_tokens": 3431308.0, "reward": 1.125, "reward_std": 0.4576992690563202, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.11428051441907883, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.07102010699133002, "frac_reward_zero_std": 0.0, "grad_norm": 4.1875, "kl": 0.01666510189534165, "learning_rate": 7.078341013824886e-06, "loss": 0.0007, "num_tokens": 3435237.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 468.75, "completions/mean_terminated_length": 468.75, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.0712045748016971, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.004176500806352124, "learning_rate": 7.096774193548388e-06, "loss": 0.0002, "num_tokens": 3449651.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.07138904261206419, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.009258315694751218, "learning_rate": 7.1152073732718896e-06, "loss": 0.0004, "num_tokens": 3458262.0, "reward": 1.125, "reward_std": 0.05563493072986603, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.055634867399930954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.07157351042243129, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.006676415679976344, "learning_rate": 7.133640552995392e-06, "loss": 0.0003, "num_tokens": 3467653.0, "reward": 1.5125000476837158, "reward_std": 0.3554283678531647, "rewards/fixed_code_pass_all_test_reward/mean": 0.5125000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.35542842745780945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 564.625, "completions/mean_terminated_length": 564.625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.07175797823279838, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.0036779690417461097, "learning_rate": 7.152073732718895e-06, "loss": 0.0001, "num_tokens": 3482618.0, "reward": 1.0125000476837158, "reward_std": 0.5350974798202515, "rewards/fixed_code_pass_all_test_reward/mean": 0.13750000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.34934747219085693, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1457.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 368.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.07194244604316546, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.009317669726442546, "learning_rate": 7.170506912442397e-06, "loss": 0.0004, "num_tokens": 3491942.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.07212691385353256, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.010448655200889334, "learning_rate": 7.188940092165899e-06, "loss": 0.0004, "num_tokens": 3501784.0, "reward": 1.2984693050384521, "reward_std": 0.6190042495727539, "rewards/fixed_code_pass_all_test_reward/mean": 0.4234693646430969, "rewards/fixed_code_pass_all_test_reward/std": 0.37037143111228943, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 444.125, "completions/mean_terminated_length": 444.125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.07231138166389965, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.008311654266435653, "learning_rate": 7.2073732718894015e-06, "loss": 0.0003, "num_tokens": 3510585.0, "reward": 1.019230842590332, "reward_std": 0.6011400818824768, "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.17804233729839325, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.07249584947426674, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.0074505830416455865, "learning_rate": 7.225806451612903e-06, "loss": 0.0003, "num_tokens": 3521328.0, "reward": 1.0879629850387573, "reward_std": 0.04399021714925766, "rewards/fixed_code_pass_all_test_reward/mean": 0.08796296268701553, "rewards/fixed_code_pass_all_test_reward/std": 0.043990183621644974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.07268031728463382, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0050389283715048805, "learning_rate": 7.244239631336406e-06, "loss": 0.0002, "num_tokens": 3531725.0, "reward": 1.3095238208770752, "reward_std": 0.4348659813404083, "rewards/fixed_code_pass_all_test_reward/mean": 0.3095237910747528, "rewards/fixed_code_pass_all_test_reward/std": 0.4348660111427307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.07286478509500093, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.0091605992638506, "learning_rate": 7.262672811059909e-06, "loss": 0.0004, "num_tokens": 3538525.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.07304925290536801, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.00791086972458288, "learning_rate": 7.28110599078341e-06, "loss": 0.0003, "num_tokens": 3546446.0, "reward": 1.7333333492279053, "reward_std": 0.28507864475250244, "rewards/fixed_code_pass_all_test_reward/mean": 0.7333333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.28507867455482483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 697.25, "completions/mean_terminated_length": 247.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.0732337207157351, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.005798906306154095, "learning_rate": 7.299539170506913e-06, "loss": 0.0002, "num_tokens": 3558944.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.0734181885261022, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.007382256560958922, "learning_rate": 7.317972350230416e-06, "loss": 0.0003, "num_tokens": 3564235.0, "reward": 1.236918568611145, "reward_std": 0.3379781246185303, "rewards/fixed_code_pass_all_test_reward/mean": 0.2369185984134674, "rewards/fixed_code_pass_all_test_reward/std": 0.33797815442085266, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 469.0, "completions/mean_terminated_length": 469.0, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.07360265633646929, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0075411254074424505, "learning_rate": 7.336405529953917e-06, "loss": 0.0003, "num_tokens": 3575163.0, "reward": 1.3181818723678589, "reward_std": 0.5454545617103577, "rewards/fixed_code_pass_all_test_reward/mean": 0.4431818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.31280240416526794, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.07378712414683637, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.00868042593356222, "learning_rate": 7.35483870967742e-06, "loss": 0.0003, "num_tokens": 3580920.0, "reward": 1.3571429252624512, "reward_std": 0.41121309995651245, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.41121309995651245, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.07397159195720347, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.006499154143966734, "learning_rate": 7.373271889400923e-06, "loss": 0.0003, "num_tokens": 3587620.0, "reward": 1.329545497894287, "reward_std": 0.25684255361557007, "rewards/fixed_code_pass_all_test_reward/mean": 0.3295454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.25684261322021484, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.07415605976757056, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.007306609419174492, "learning_rate": 7.3917050691244244e-06, "loss": 0.0003, "num_tokens": 3594606.0, "reward": 1.1712963581085205, "reward_std": 0.1802925169467926, "rewards/fixed_code_pass_all_test_reward/mean": 0.17129631340503693, "rewards/fixed_code_pass_all_test_reward/std": 0.1802925169467926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.07434052757793765, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.0058782399573829025, "learning_rate": 7.410138248847927e-06, "loss": 0.0002, "num_tokens": 3599061.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 525.375, "completions/mean_terminated_length": 525.375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.07452499538830475, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.0017721121330396272, "learning_rate": 7.428571428571429e-06, "loss": 0.0001, "num_tokens": 3607536.0, "reward": 1.7083333730697632, "reward_std": 0.41547447443008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07470946319867183, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.003414766521018464, "learning_rate": 7.447004608294931e-06, "loss": 0.0001, "num_tokens": 3613674.0, "reward": 1.890625, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.07489393100903892, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.007081861549522728, "learning_rate": 7.465437788018434e-06, "loss": 0.0003, "num_tokens": 3617417.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 454.75, "completions/mean_terminated_length": 454.75, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.07507839881940602, "frac_reward_zero_std": 1.0, "grad_norm": 0.038818359375, "kl": 0.004671683054766618, "learning_rate": 7.483870967741936e-06, "loss": 0.0002, "num_tokens": 3625655.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 567.625, "completions/mean_terminated_length": 567.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.0752628666297731, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.003710546501679346, "learning_rate": 7.502304147465438e-06, "loss": 0.0001, "num_tokens": 3637236.0, "reward": 1.0208333730697632, "reward_std": 0.024622410535812378, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.024622369557619095, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.07544733444014019, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.011776236293371767, "learning_rate": 7.52073732718894e-06, "loss": 0.0005, "num_tokens": 3643655.0, "reward": 1.3421052694320679, "reward_std": 0.47908368706703186, "rewards/fixed_code_pass_all_test_reward/mean": 0.34210526943206787, "rewards/fixed_code_pass_all_test_reward/std": 0.47908368706703186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.07563180225050728, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.010540796763962135, "learning_rate": 7.5391705069124435e-06, "loss": 0.0004, "num_tokens": 3651035.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.07581627006087438, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.008632576616946608, "learning_rate": 7.557603686635945e-06, "loss": 0.0003, "num_tokens": 3661590.0, "reward": 1.1071429252624512, "reward_std": 0.16642357409000397, "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.16642354428768158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 445.5, "completions/mean_terminated_length": 445.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.07600073787124147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.008751891989959404, "learning_rate": 7.576036866359447e-06, "loss": 0.0004, "num_tokens": 3673506.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.07618520568160855, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.002681053098058328, "learning_rate": 7.59447004608295e-06, "loss": 0.0001, "num_tokens": 3679602.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.07636967349197565, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.007404296571621671, "learning_rate": 7.612903225806451e-06, "loss": 0.0003, "num_tokens": 3688879.0, "reward": 1.6607143878936768, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3535534143447876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.07655414130234274, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.017222604888956994, "learning_rate": 7.631336405529954e-06, "loss": 0.0007, "num_tokens": 3695700.0, "reward": 0.9510869979858398, "reward_std": 0.672534167766571, "rewards/fixed_code_pass_all_test_reward/mean": 0.20108693838119507, "rewards/fixed_code_pass_all_test_reward/std": 0.3508698642253876, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07673860911270983, "frac_reward_zero_std": 1.0, "grad_norm": 0.053955078125, "kl": 0.00511009362526238, "learning_rate": 7.649769585253457e-06, "loss": 0.0002, "num_tokens": 3699918.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.07692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.010440759273478761, "learning_rate": 7.66820276497696e-06, "loss": 0.0004, "num_tokens": 3709959.0, "reward": 1.3510100841522217, "reward_std": 0.7012073993682861, "rewards/fixed_code_pass_all_test_reward/mean": 0.4760100841522217, "rewards/fixed_code_pass_all_test_reward/std": 0.4803008735179901, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.07710754473344401, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.014838128234259784, "learning_rate": 7.686635944700462e-06, "loss": 0.0006, "num_tokens": 3718239.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 749.875, "completions/mean_terminated_length": 749.875, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.0772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.004735645023174584, "learning_rate": 7.705069124423963e-06, "loss": 0.0002, "num_tokens": 3735702.0, "reward": 1.3125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0774764803541782, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.007138397195376456, "learning_rate": 7.723502304147466e-06, "loss": 0.0003, "num_tokens": 3744312.0, "reward": 1.1994680166244507, "reward_std": 0.35327890515327454, "rewards/fixed_code_pass_all_test_reward/mean": 0.19946807622909546, "rewards/fixed_code_pass_all_test_reward/std": 0.35327890515327454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.07766094816454529, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.0062046901730354875, "learning_rate": 7.741935483870968e-06, "loss": 0.0002, "num_tokens": 3752405.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.07784541597491237, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.005573209258727729, "learning_rate": 7.760368663594471e-06, "loss": 0.0002, "num_tokens": 3757043.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 445.125, "completions/mean_terminated_length": 445.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.07802988378527947, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.005312072375090793, "learning_rate": 7.778801843317973e-06, "loss": 0.0002, "num_tokens": 3770788.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.07821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.004795217857463285, "learning_rate": 7.797235023041474e-06, "loss": 0.0002, "num_tokens": 3780842.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.07839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.009189833799609914, "learning_rate": 7.815668202764978e-06, "loss": 0.0004, "num_tokens": 3785235.0, "reward": 0.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.07858328721638075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0274658203125, "kl": 0.003783193475101143, "learning_rate": 7.83410138248848e-06, "loss": 0.0002, "num_tokens": 3789846.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.07876775502674783, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.006670542468782514, "learning_rate": 7.852534562211982e-06, "loss": 0.0003, "num_tokens": 3799167.0, "reward": 1.18478262424469, "reward_std": 0.5859780311584473, "rewards/fixed_code_pass_all_test_reward/mean": 0.30978259444236755, "rewards/fixed_code_pass_all_test_reward/std": 0.3603622019290924, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07895222283711492, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.010415720200398937, "learning_rate": 7.870967741935484e-06, "loss": 0.0004, "num_tokens": 3805229.0, "reward": 1.0719339847564697, "reward_std": 0.2034599632024765, "rewards/fixed_code_pass_all_test_reward/mean": 0.07193396240472794, "rewards/fixed_code_pass_all_test_reward/std": 0.2034599632024765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 88.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.07913669064748201, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.007869254244724289, "learning_rate": 7.889400921658987e-06, "loss": 0.0003, "num_tokens": 3808717.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.07932115845784911, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.00551656776224263, "learning_rate": 7.907834101382489e-06, "loss": 0.0002, "num_tokens": 3812997.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 385.875, "completions/mean_terminated_length": 148.42857360839844, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.0795056262682162, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.010437879238452297, "learning_rate": 7.926267281105992e-06, "loss": 0.0004, "num_tokens": 3818836.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.07969009407858328, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0076426854357123375, "learning_rate": 7.944700460829495e-06, "loss": 0.0003, "num_tokens": 3827026.0, "reward": 1.4898648262023926, "reward_std": 0.2673344314098358, "rewards/fixed_code_pass_all_test_reward/mean": 0.48986485600471497, "rewards/fixed_code_pass_all_test_reward/std": 0.2673344612121582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 659.125, "completions/mean_terminated_length": 659.125, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.07987456188895038, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.003179049410391599, "learning_rate": 7.963133640552997e-06, "loss": 0.0001, "num_tokens": 3839435.0, "reward": 1.1583333015441895, "reward_std": 0.345377653837204, "rewards/fixed_code_pass_all_test_reward/mean": 0.15833333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.345377653837204, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08005902969931747, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.008061024942435324, "learning_rate": 7.981566820276498e-06, "loss": 0.0003, "num_tokens": 3849790.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.08024349750968456, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.006641245912760496, "learning_rate": 8.000000000000001e-06, "loss": 0.0003, "num_tokens": 3857536.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 250.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.08042796532005166, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.01739987311884761, "learning_rate": 8.018433179723503e-06, "loss": 0.0007, "num_tokens": 3865474.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1413.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 496.25, "completions/mean_terminated_length": 496.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.08061243313041874, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0060167695992277, "learning_rate": 8.036866359447006e-06, "loss": 0.0002, "num_tokens": 3877508.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.08079690094078583, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.007288697815965861, "learning_rate": 8.055299539170508e-06, "loss": 0.0003, "num_tokens": 3882022.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 463.125, "completions/mean_terminated_length": 463.125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.08098136875115293, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.005428747681435198, "learning_rate": 8.07373271889401e-06, "loss": 0.0002, "num_tokens": 3891407.0, "reward": 1.2291666269302368, "reward_std": 0.426665723323822, "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.4266657531261444, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.08116583656152002, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.007323861646000296, "learning_rate": 8.092165898617512e-06, "loss": 0.0003, "num_tokens": 3897366.0, "reward": 1.6477272510528564, "reward_std": 0.48699134588241577, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.48699134588241577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 438.875, "completions/mean_terminated_length": 209.00001525878906, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0813503043718871, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.007362302916590124, "learning_rate": 8.110599078341016e-06, "loss": 0.0003, "num_tokens": 3907069.0, "reward": 1.128787875175476, "reward_std": 0.6401147246360779, "rewards/fixed_code_pass_all_test_reward/mean": 0.2537878751754761, "rewards/fixed_code_pass_all_test_reward/std": 0.46069079637527466, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 306.625, "completions/mean_terminated_length": 306.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0815347721822542, "frac_reward_zero_std": 0.0, "grad_norm": 43.75, "kl": 0.08235446570324712, "learning_rate": 8.129032258064517e-06, "loss": 0.0033, "num_tokens": 3917058.0, "reward": 1.6062500476837158, "reward_std": 0.6538007259368896, "rewards/fixed_code_pass_all_test_reward/mean": 0.731249988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.3058215081691742, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.08171923999262129, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.006973760697292164, "learning_rate": 8.147465437788019e-06, "loss": 0.0003, "num_tokens": 3920927.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.08190370780298838, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.012409621442202479, "learning_rate": 8.165898617511522e-06, "loss": 0.0005, "num_tokens": 3928426.0, "reward": 1.4331395626068115, "reward_std": 0.5681463479995728, "rewards/fixed_code_pass_all_test_reward/mean": 0.6831395626068115, "rewards/fixed_code_pass_all_test_reward/std": 0.19255134463310242, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08208817561335546, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.007150022676796652, "learning_rate": 8.184331797235023e-06, "loss": 0.0003, "num_tokens": 3937975.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1146.0, "completions/mean_terminated_length": 1146.0, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.08227264342372256, "frac_reward_zero_std": 0.0, "grad_norm": 0.5625, "kl": 0.0022964466115809046, "learning_rate": 8.202764976958527e-06, "loss": 0.0001, "num_tokens": 3960519.0, "reward": 0.9895833730697632, "reward_std": 0.5126137733459473, "rewards/fixed_code_pass_all_test_reward/mean": 0.1145833358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.08245711123408965, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517578125, "kl": 0.007960955641465262, "learning_rate": 8.221198156682028e-06, "loss": 0.0003, "num_tokens": 3969577.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.08264157904445674, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.009985378885176033, "learning_rate": 8.23963133640553e-06, "loss": 0.0004, "num_tokens": 3980686.0, "reward": 1.6687500476837158, "reward_std": 0.7116066217422485, "rewards/fixed_code_pass_all_test_reward/mean": 0.793749988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.39318978786468506, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 756.125, "completions/mean_terminated_length": 571.5714721679688, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.08282604685482384, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.0033089515927713364, "learning_rate": 8.258064516129033e-06, "loss": 0.0001, "num_tokens": 3992559.0, "reward": 1.30978262424469, "reward_std": 0.6864442825317383, "rewards/fixed_code_pass_all_test_reward/mean": 0.43478259444236755, "rewards/fixed_code_pass_all_test_reward/std": 0.47114986181259155, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.08301051466519092, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.004083815962076187, "learning_rate": 8.276497695852536e-06, "loss": 0.0002, "num_tokens": 3998052.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.08319498247555801, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.009311292902566493, "learning_rate": 8.294930875576038e-06, "loss": 0.0004, "num_tokens": 4008925.0, "reward": 1.2407407760620117, "reward_std": 0.19598153233528137, "rewards/fixed_code_pass_all_test_reward/mean": 0.24074074625968933, "rewards/fixed_code_pass_all_test_reward/std": 0.19598159193992615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.08337945028592511, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.007063408003887162, "learning_rate": 8.31336405529954e-06, "loss": 0.0003, "num_tokens": 4016800.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.0835639180962922, "frac_reward_zero_std": 1.0, "grad_norm": 0.03515625, "kl": 0.003827101696515456, "learning_rate": 8.331797235023043e-06, "loss": 0.0002, "num_tokens": 4023041.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.08374838590665928, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.01117328746477142, "learning_rate": 8.350230414746544e-06, "loss": 0.0004, "num_tokens": 4027377.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08393285371702638, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.012280484283110127, "learning_rate": 8.368663594470047e-06, "loss": 0.0005, "num_tokens": 4034382.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.08411732152739347, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.007756367325782776, "learning_rate": 8.387096774193549e-06, "loss": 0.0003, "num_tokens": 4038463.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.08430178933776056, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0048126247129403055, "learning_rate": 8.40552995391705e-06, "loss": 0.0002, "num_tokens": 4043056.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.08448625714812766, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.009729965298902243, "learning_rate": 8.423963133640554e-06, "loss": 0.0004, "num_tokens": 4051839.0, "reward": 1.394230842590332, "reward_std": 0.24476775527000427, "rewards/fixed_code_pass_all_test_reward/mean": 0.39423078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.24476775527000427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.08467072495849474, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.01290415384573862, "learning_rate": 8.442396313364057e-06, "loss": 0.0005, "num_tokens": 4061352.0, "reward": 1.4895833730697632, "reward_std": 0.29693371057510376, "rewards/fixed_code_pass_all_test_reward/mean": 0.4895833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 365.5, "completions/mean_terminated_length": 365.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.08485519276886183, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.023275704821571708, "learning_rate": 8.460829493087558e-06, "loss": 0.0009, "num_tokens": 4069012.0, "reward": 0.9874999523162842, "reward_std": 0.34511902928352356, "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, "rewards/fixed_code_pass_all_test_reward/std": 0.06943651288747787, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.08503966057922892, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.007288396620424464, "learning_rate": 8.47926267281106e-06, "loss": 0.0003, "num_tokens": 4073128.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.08522412838959602, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.009763230045791715, "learning_rate": 8.497695852534563e-06, "loss": 0.0004, "num_tokens": 4080876.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.0854085961999631, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.004037137754494324, "learning_rate": 8.516129032258065e-06, "loss": 0.0002, "num_tokens": 4089293.0, "reward": 1.5178570747375488, "reward_std": 0.6169180870056152, "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.26816877722740173, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.08559306401033019, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.00536981294862926, "learning_rate": 8.534562211981568e-06, "loss": 0.0002, "num_tokens": 4097988.0, "reward": 1.625, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.08577753182069729, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.006526368291815743, "learning_rate": 8.55299539170507e-06, "loss": 0.0003, "num_tokens": 4105154.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 351.75, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.08596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.006330050469841808, "learning_rate": 8.571428571428571e-06, "loss": 0.0003, "num_tokens": 4112616.0, "reward": 1.125, "reward_std": 0.23570221662521362, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022613286972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.08614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.014002199633978307, "learning_rate": 8.589861751152074e-06, "loss": 0.0006, "num_tokens": 4121177.0, "reward": 1.1354167461395264, "reward_std": 0.35055938363075256, "rewards/fixed_code_pass_all_test_reward/mean": 0.1354166716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.3505593538284302, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.08633093525179857, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.014031477738171816, "learning_rate": 8.608294930875577e-06, "loss": 0.0006, "num_tokens": 4127943.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.08651540306216565, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.013643107493408024, "learning_rate": 8.626728110599079e-06, "loss": 0.0005, "num_tokens": 4137448.0, "reward": 1.1428570747375488, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.08669987087253274, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.009878469543764368, "learning_rate": 8.64516129032258e-06, "loss": 0.0004, "num_tokens": 4147113.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 492.25, "completions/mean_terminated_length": 492.25, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.08688433868289984, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.005292469693813473, "learning_rate": 8.663594470046084e-06, "loss": 0.0002, "num_tokens": 4160651.0, "reward": 1.9027777910232544, "reward_std": 0.2749859392642975, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 423.875, "completions/mean_terminated_length": 423.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.08706880649326693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.008746038933168165, "learning_rate": 8.682027649769585e-06, "loss": 0.0003, "num_tokens": 4173666.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 474.125, "completions/mean_terminated_length": 474.125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.08725327430363401, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.0050069118733517826, "learning_rate": 8.700460829493088e-06, "loss": 0.0002, "num_tokens": 4184979.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.08743774211400111, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.01655578170903027, "learning_rate": 8.71889400921659e-06, "loss": 0.0007, "num_tokens": 4195329.0, "reward": 1.1590909957885742, "reward_std": 0.06428244709968567, "rewards/fixed_code_pass_all_test_reward/mean": 0.15909090638160706, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.0876222099243682, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.012738831341266632, "learning_rate": 8.737327188940093e-06, "loss": 0.0005, "num_tokens": 4203946.0, "reward": 1.0299999713897705, "reward_std": 0.08485280722379684, "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.08780667773473529, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.01002240157686174, "learning_rate": 8.755760368663595e-06, "loss": 0.0004, "num_tokens": 4217630.0, "reward": 1.0184426307678223, "reward_std": 0.052163634449243546, "rewards/fixed_code_pass_all_test_reward/mean": 0.01844262331724167, "rewards/fixed_code_pass_all_test_reward/std": 0.052163612097501755, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.08799114554510237, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.007251915172673762, "learning_rate": 8.774193548387098e-06, "loss": 0.0003, "num_tokens": 4221741.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.08817561335546947, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0037967060925439, "learning_rate": 8.7926267281106e-06, "loss": 0.0002, "num_tokens": 4225964.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 574.625, "completions/mean_terminated_length": 364.14288330078125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.08836008116583656, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.007972997380420566, "learning_rate": 8.811059907834103e-06, "loss": 0.0003, "num_tokens": 4235209.0, "reward": 1.1530611515045166, "reward_std": 0.5718449354171753, "rewards/fixed_code_pass_all_test_reward/mean": 0.2780612111091614, "rewards/fixed_code_pass_all_test_reward/std": 0.3500864803791046, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.08854454897620365, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.006501436437247321, "learning_rate": 8.829493087557604e-06, "loss": 0.0003, "num_tokens": 4240387.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.08872901678657075, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.016169553971849382, "learning_rate": 8.847926267281107e-06, "loss": 0.0006, "num_tokens": 4246950.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.08891348459693783, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.013593474985100329, "learning_rate": 8.866359447004609e-06, "loss": 0.0005, "num_tokens": 4257158.0, "reward": 1.351063847541809, "reward_std": 0.41131263971328735, "rewards/fixed_code_pass_all_test_reward/mean": 0.4760638177394867, "rewards/fixed_code_pass_all_test_reward/std": 0.32979950308799744, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.08909795240730492, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.018375163315795362, "learning_rate": 8.884792626728112e-06, "loss": 0.0007, "num_tokens": 4268570.0, "reward": 1.389423131942749, "reward_std": 0.37030377984046936, "rewards/fixed_code_pass_all_test_reward/mean": 0.38942307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.37030377984046936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.08928242021767202, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.011039349192287773, "learning_rate": 8.903225806451614e-06, "loss": 0.0004, "num_tokens": 4275039.0, "reward": 1.663461446762085, "reward_std": 0.13598209619522095, "rewards/fixed_code_pass_all_test_reward/mean": 0.6634615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.13598206639289856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.08946688802803911, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.009812530159251764, "learning_rate": 8.921658986175115e-06, "loss": 0.0004, "num_tokens": 4283397.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 478.5, "completions/mean_terminated_length": 478.5, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.0896513558384062, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.008848927856888622, "learning_rate": 8.940092165898619e-06, "loss": 0.0004, "num_tokens": 4295729.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.023507019854150712, "learning_rate": 8.958525345622122e-06, "loss": 0.0009, "num_tokens": 4303727.0, "reward": 1.7625000476837158, "reward_std": 0.219983771443367, "rewards/fixed_code_pass_all_test_reward/mean": 0.7625000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.2199837565422058, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.09002029145914038, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.015778631321154535, "learning_rate": 8.976958525345623e-06, "loss": 0.0006, "num_tokens": 4312524.0, "reward": 1.6931817531585693, "reward_std": 0.42414578795433044, "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.42414578795433044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 653.0, "completions/mean_terminated_length": 653.0, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.09020475926950747, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.008133421768434346, "learning_rate": 8.995391705069125e-06, "loss": 0.0003, "num_tokens": 4328804.0, "reward": 0.9107142686843872, "reward_std": 0.506157398223877, "rewards/fixed_code_pass_all_test_reward/mean": 0.1607142835855484, "rewards/fixed_code_pass_all_test_reward/std": 0.17806050181388855, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.09038922707987457, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.013259486702736467, "learning_rate": 9.013824884792628e-06, "loss": 0.0005, "num_tokens": 4333228.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.09057369489024165, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02614689970505424, "learning_rate": 9.03225806451613e-06, "loss": 0.001, "num_tokens": 4339495.0, "reward": 1.4666666984558105, "reward_std": 0.1885617971420288, "rewards/fixed_code_pass_all_test_reward/mean": 0.46666669845581055, "rewards/fixed_code_pass_all_test_reward/std": 0.1885618269443512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.09075816270060874, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.021704821148887277, "learning_rate": 9.050691244239633e-06, "loss": 0.0009, "num_tokens": 4349889.0, "reward": 1.3125, "reward_std": 0.2912411689758301, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.29124119877815247, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.09094263051097584, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.01827052456792444, "learning_rate": 9.069124423963134e-06, "loss": 0.0007, "num_tokens": 4358312.0, "reward": 1.6454325914382935, "reward_std": 0.40258297324180603, "rewards/fixed_code_pass_all_test_reward/mean": 0.770432710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.3204118013381958, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 769.625, "completions/mean_terminated_length": 769.625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.09112709832134293, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.006645219080382958, "learning_rate": 9.087557603686636e-06, "loss": 0.0003, "num_tokens": 4376293.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 403.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.09131156613171001, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.020066366530954838, "learning_rate": 9.105990783410139e-06, "loss": 0.0008, "num_tokens": 4387354.0, "reward": 1.0499999523162842, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 638.875, "completions/mean_terminated_length": 638.875, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "epoch": 0.0914960339420771, "frac_reward_zero_std": 0.0, "grad_norm": 0.6171875, "kl": 0.006086960493121296, "learning_rate": 9.124423963133642e-06, "loss": 0.0002, "num_tokens": 4399681.0, "reward": 1.3161765336990356, "reward_std": 0.12453576177358627, "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, "rewards/fixed_code_pass_all_test_reward/std": 0.12453572452068329, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.0916805017524442, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.009968580154236406, "learning_rate": 9.142857142857144e-06, "loss": 0.0004, "num_tokens": 4408686.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.09186496956281129, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.017985473736189306, "learning_rate": 9.161290322580645e-06, "loss": 0.0007, "num_tokens": 4412284.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.09204943737317837, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.00770829024259001, "learning_rate": 9.179723502304149e-06, "loss": 0.0003, "num_tokens": 4417731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.09223390518354548, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.016197526594623923, "learning_rate": 9.19815668202765e-06, "loss": 0.0006, "num_tokens": 4425468.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.09241837299391256, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.010029495519120246, "learning_rate": 9.216589861751153e-06, "loss": 0.0004, "num_tokens": 4430294.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.09260284080427965, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.00859968806616962, "learning_rate": 9.235023041474655e-06, "loss": 0.0003, "num_tokens": 4438401.0, "reward": 1.7708333730697632, "reward_std": 0.27706217765808105, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.27706217765808105, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.09278730861464675, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.015643441933207214, "learning_rate": 9.253456221198156e-06, "loss": 0.0006, "num_tokens": 4444701.0, "reward": 1.4202585220336914, "reward_std": 0.27477267384529114, "rewards/fixed_code_pass_all_test_reward/mean": 0.42025864124298096, "rewards/fixed_code_pass_all_test_reward/std": 0.2747727334499359, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.09297177642501384, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.009262629377190024, "learning_rate": 9.27188940092166e-06, "loss": 0.0004, "num_tokens": 4451732.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.09315624423538092, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.015469123609364033, "learning_rate": 9.290322580645163e-06, "loss": 0.0006, "num_tokens": 4460181.0, "reward": 1.043269157409668, "reward_std": 0.059717193245887756, "rewards/fixed_code_pass_all_test_reward/mean": 0.04326923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.09334071204574802, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.029780214419588447, "learning_rate": 9.308755760368664e-06, "loss": 0.0012, "num_tokens": 4464003.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09352517985611511, "frac_reward_zero_std": 1.0, "grad_norm": 0.034423828125, "kl": 0.006179669551784173, "learning_rate": 9.327188940092166e-06, "loss": 0.0002, "num_tokens": 4469817.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.0937096476664822, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.012020637281239033, "learning_rate": 9.34562211981567e-06, "loss": 0.0005, "num_tokens": 4478661.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.012052517209667712, "learning_rate": 9.36405529953917e-06, "loss": 0.0005, "num_tokens": 4483110.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 386.125, "completions/mean_terminated_length": 386.125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.09407858328721638, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.006277043285081163, "learning_rate": 9.382488479262674e-06, "loss": 0.0003, "num_tokens": 4491287.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 421.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.09426305109758347, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.01542604630230926, "learning_rate": 9.400921658986176e-06, "loss": 0.0006, "num_tokens": 4502228.0, "reward": 1.8406250476837158, "reward_std": 0.2146165668964386, "rewards/fixed_code_pass_all_test_reward/mean": 0.840624988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.2146165817975998, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.09444751890795056, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.022664624731987715, "learning_rate": 9.419354838709677e-06, "loss": 0.0009, "num_tokens": 4510765.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.09463198671831766, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.013284680084325373, "learning_rate": 9.43778801843318e-06, "loss": 0.0005, "num_tokens": 4519541.0, "reward": 1.5735294818878174, "reward_std": 0.42037805914878845, "rewards/fixed_code_pass_all_test_reward/mean": 0.5735294222831726, "rewards/fixed_code_pass_all_test_reward/std": 0.4203781187534332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.09481645452868474, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.019034119206480682, "learning_rate": 9.456221198156684e-06, "loss": 0.0008, "num_tokens": 4525134.0, "reward": 1.6759867668151855, "reward_std": 0.27356648445129395, "rewards/fixed_code_pass_all_test_reward/mean": 0.6759868264198303, "rewards/fixed_code_pass_all_test_reward/std": 0.27356648445129395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.09500092233905183, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.010230286192381755, "learning_rate": 9.474654377880185e-06, "loss": 0.0004, "num_tokens": 4530347.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09518539014941893, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.019840551540255547, "learning_rate": 9.493087557603687e-06, "loss": 0.0008, "num_tokens": 4534361.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 693.875, "completions/mean_terminated_length": 693.875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.09536985795978602, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.008218316215788946, "learning_rate": 9.51152073732719e-06, "loss": 0.0003, "num_tokens": 4549136.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 465.75, "completions/mean_terminated_length": 465.75, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.0955543257701531, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.012049072829540819, "learning_rate": 9.529953917050691e-06, "loss": 0.0005, "num_tokens": 4558054.0, "reward": 1.4642857313156128, "reward_std": 0.44361352920532227, "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.44361358880996704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 427.125, "completions/mean_terminated_length": 427.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.0957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.00742917726165615, "learning_rate": 9.548387096774195e-06, "loss": 0.0003, "num_tokens": 4568471.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09592326139088729, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.01198087417287752, "learning_rate": 9.566820276497696e-06, "loss": 0.0005, "num_tokens": 4576826.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 548.375, "completions/mean_terminated_length": 548.375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.09610772920125438, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.004077940568095073, "learning_rate": 9.5852534562212e-06, "loss": 0.0002, "num_tokens": 4589037.0, "reward": 1.784999966621399, "reward_std": 0.40507495403289795, "rewards/fixed_code_pass_all_test_reward/mean": 0.7849999666213989, "rewards/fixed_code_pass_all_test_reward/std": 0.40507495403289795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.09629219701162148, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.005467346258228645, "learning_rate": 9.603686635944701e-06, "loss": 0.0002, "num_tokens": 4599037.0, "reward": 1.7801203727722168, "reward_std": 0.06134949252009392, "rewards/fixed_code_pass_all_test_reward/mean": 0.7801204919815063, "rewards/fixed_code_pass_all_test_reward/std": 0.0613495409488678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 439.375, "completions/mean_terminated_length": 439.375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.09647666482198856, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.021775180648546666, "learning_rate": 9.622119815668204e-06, "loss": 0.0009, "num_tokens": 4611288.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.09666113263235565, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.015284803317626938, "learning_rate": 9.640552995391706e-06, "loss": 0.0006, "num_tokens": 4617878.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.09684560044272275, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.004679077537730336, "learning_rate": 9.658986175115209e-06, "loss": 0.0002, "num_tokens": 4626841.0, "reward": 1.8499999046325684, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 579.875, "completions/mean_terminated_length": 579.875, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.09703006825308984, "frac_reward_zero_std": 0.0, "grad_norm": 0.65625, "kl": 0.003018162358785048, "learning_rate": 9.67741935483871e-06, "loss": 0.0001, "num_tokens": 4640264.0, "reward": 1.774999976158142, "reward_std": 0.15507294237613678, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.15507294237613678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.09721453606345692, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.006314793601632118, "learning_rate": 9.695852534562212e-06, "loss": 0.0003, "num_tokens": 4649088.0, "reward": 1.8333332538604736, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 968.625, "completions/mean_terminated_length": 968.625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.09739900387382401, "frac_reward_zero_std": 0.0, "grad_norm": 0.6484375, "kl": 0.0034916122094728053, "learning_rate": 9.714285714285715e-06, "loss": 0.0001, "num_tokens": 4663909.0, "reward": 1.396484375, "reward_std": 0.7335879802703857, "rewards/fixed_code_pass_all_test_reward/mean": 0.521484375, "rewards/fixed_code_pass_all_test_reward/std": 0.5139608979225159, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.09758347168419111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.012409394374117255, "learning_rate": 9.732718894009218e-06, "loss": 0.0005, "num_tokens": 4672829.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.0977679394945582, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.014863356482237577, "learning_rate": 9.75115207373272e-06, "loss": 0.0006, "num_tokens": 4682753.0, "reward": 1.3894927501678467, "reward_std": 0.4045705795288086, "rewards/fixed_code_pass_all_test_reward/mean": 0.3894927501678467, "rewards/fixed_code_pass_all_test_reward/std": 0.4045705795288086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.09795240730492528, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03738776163663715, "learning_rate": 9.769585253456221e-06, "loss": 0.0015, "num_tokens": 4692536.0, "reward": 1.274999976158142, "reward_std": 0.45276927947998047, "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.09813687511529239, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.023198360693641007, "learning_rate": 9.788018433179725e-06, "loss": 0.0009, "num_tokens": 4701891.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 147.375, "completions/mean_terminated_length": 147.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.09832134292565947, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.026852728566154838, "learning_rate": 9.806451612903226e-06, "loss": 0.0011, "num_tokens": 4705750.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.09850581073602656, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.008462803525617346, "learning_rate": 9.82488479262673e-06, "loss": 0.0003, "num_tokens": 4714313.0, "reward": 1.451923131942749, "reward_std": 0.3143174350261688, "rewards/fixed_code_pass_all_test_reward/mean": 0.5769230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.3076923191547394, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 405.375, "completions/mean_terminated_length": 405.375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.09869027854639366, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.00855463364860043, "learning_rate": 9.843317972350231e-06, "loss": 0.0003, "num_tokens": 4722204.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 372.625, "completions/mean_terminated_length": 372.625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.09887474635676075, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.024772272241534665, "learning_rate": 9.861751152073733e-06, "loss": 0.001, "num_tokens": 4731697.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.09905921416712783, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.015534053789451718, "learning_rate": 9.880184331797236e-06, "loss": 0.0006, "num_tokens": 4739891.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 528.625, "completions/mean_terminated_length": 528.625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.09924368197749493, "frac_reward_zero_std": 1.0, "grad_norm": 0.055908203125, "kl": 0.009997292479965836, "learning_rate": 9.898617511520739e-06, "loss": 0.0004, "num_tokens": 4753312.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.09942814978786202, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.011408571677748114, "learning_rate": 9.91705069124424e-06, "loss": 0.0005, "num_tokens": 4763083.0, "reward": 1.21875, "reward_std": 0.348590224981308, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.018483906984329224, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.0996126175982291, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.01086769945686683, "learning_rate": 9.935483870967742e-06, "loss": 0.0004, "num_tokens": 4768294.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.0997970854085962, "frac_reward_zero_std": 1.0, "grad_norm": 0.80078125, "kl": 0.06066339008975774, "learning_rate": 9.953917050691245e-06, "loss": 0.0024, "num_tokens": 4772362.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 381.375, "completions/mean_terminated_length": 381.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.09998155321896329, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.009474723774474114, "learning_rate": 9.972350230414747e-06, "loss": 0.0004, "num_tokens": 4783197.0, "reward": 1.329861044883728, "reward_std": 0.1272396594285965, "rewards/fixed_code_pass_all_test_reward/mean": 0.3298611044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.1272396296262741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.10016602102933038, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.018227670050691813, "learning_rate": 9.99078341013825e-06, "loss": 0.0007, "num_tokens": 4793279.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.10035048883969747, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.011913210444618016, "learning_rate": 1.0009216589861752e-05, "loss": 0.0005, "num_tokens": 4801563.0, "reward": 1.15625, "reward_std": 0.06681530922651291, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.10053495665006457, "frac_reward_zero_std": 1.0, "grad_norm": 0.224609375, "kl": 0.014751313021406531, "learning_rate": 1.0027649769585255e-05, "loss": 0.0006, "num_tokens": 4809440.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.10071942446043165, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.008262459567049518, "learning_rate": 1.0046082949308758e-05, "loss": 0.0003, "num_tokens": 4814430.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.10090389227079874, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.017391397792380303, "learning_rate": 1.0064516129032258e-05, "loss": 0.0007, "num_tokens": 4820131.0, "reward": 1.515625, "reward_std": 0.5194326043128967, "rewards/fixed_code_pass_all_test_reward/mean": 0.515625, "rewards/fixed_code_pass_all_test_reward/std": 0.5194326639175415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10108836008116584, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.0258788182400167, "learning_rate": 1.0082949308755761e-05, "loss": 0.001, "num_tokens": 4830180.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.10127282789153293, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.013796435610856861, "learning_rate": 1.0101382488479263e-05, "loss": 0.0006, "num_tokens": 4838914.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.10145729570190001, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.004797032437636517, "learning_rate": 1.0119815668202766e-05, "loss": 0.0002, "num_tokens": 4844145.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.10164176351226711, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.02610538701992482, "learning_rate": 1.0138248847926269e-05, "loss": 0.001, "num_tokens": 4853927.0, "reward": 1.60546875, "reward_std": 0.4218956530094147, "rewards/fixed_code_pass_all_test_reward/mean": 0.60546875, "rewards/fixed_code_pass_all_test_reward/std": 0.42189568281173706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1018262313226342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0291748046875, "kl": 0.005364995478885248, "learning_rate": 1.015668202764977e-05, "loss": 0.0002, "num_tokens": 4858413.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.10201069913300129, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.027199636911973357, "learning_rate": 1.0175115207373272e-05, "loss": 0.0011, "num_tokens": 4866000.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.10219516694336839, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.015301419596653432, "learning_rate": 1.0193548387096774e-05, "loss": 0.0006, "num_tokens": 4874199.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10237963475373547, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.032321638660505414, "learning_rate": 1.0211981566820277e-05, "loss": 0.0013, "num_tokens": 4880036.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 428.25, "completions/mean_terminated_length": 428.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.10256410256410256, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.009116288798395544, "learning_rate": 1.023041474654378e-05, "loss": 0.0004, "num_tokens": 4890758.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.10274857037446966, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.025524010183289647, "learning_rate": 1.0248847926267282e-05, "loss": 0.001, "num_tokens": 4894863.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.10293303818483675, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.027910651348065585, "learning_rate": 1.0267281105990785e-05, "loss": 0.0011, "num_tokens": 4902597.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.10311750599520383, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.02021293924190104, "learning_rate": 1.0285714285714285e-05, "loss": 0.0008, "num_tokens": 4908443.0, "reward": 1.2916666269302368, "reward_std": 0.03450329229235649, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0345032773911953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.10330197380557093, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.009626896004192531, "learning_rate": 1.0304147465437788e-05, "loss": 0.0004, "num_tokens": 4916147.0, "reward": 1.90625, "reward_std": 0.03788074478507042, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.03788072615861893, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 640.75, "completions/mean_terminated_length": 439.71429443359375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.10348644161593802, "frac_reward_zero_std": 0.0, "grad_norm": 0.55078125, "kl": 0.005345965866581537, "learning_rate": 1.0322580645161291e-05, "loss": 0.0002, "num_tokens": 4929937.0, "reward": 1.2833333015441895, "reward_std": 0.5185449719429016, "rewards/fixed_code_pass_all_test_reward/mean": 0.40833333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1649915874004364, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.10367090942630511, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.011743417038815096, "learning_rate": 1.0341013824884793e-05, "loss": 0.0005, "num_tokens": 4935042.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1038553772366722, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02816119126509875, "learning_rate": 1.0359447004608296e-05, "loss": 0.0011, "num_tokens": 4943764.0, "reward": 1.046875, "reward_std": 0.03477181866765022, "rewards/fixed_code_pass_all_test_reward/mean": 0.046875, "rewards/fixed_code_pass_all_test_reward/std": 0.034771788865327835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 859.375, "completions/mean_terminated_length": 859.375, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 0.1040398450470393, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.006007810065057129, "learning_rate": 1.03778801843318e-05, "loss": 0.0002, "num_tokens": 4961175.0, "reward": 1.0625, "reward_std": 0.9082404375076294, "rewards/fixed_code_pass_all_test_reward/mean": 0.4374999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.426665723323822, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.10422431285740638, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.002705598730244674, "learning_rate": 1.0396313364055299e-05, "loss": 0.0001, "num_tokens": 4967762.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 759.5, "completions/mean_terminated_length": 759.5, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.10440878066777347, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.007965603726916015, "learning_rate": 1.0414746543778802e-05, "loss": 0.0003, "num_tokens": 4984398.0, "reward": 1.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.10459324847814057, "frac_reward_zero_std": 1.0, "grad_norm": 0.031982421875, "kl": 0.003391963335161563, "learning_rate": 1.0433179723502306e-05, "loss": 0.0001, "num_tokens": 4990913.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.10477771628850766, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.011580035963561386, "learning_rate": 1.0451612903225807e-05, "loss": 0.0005, "num_tokens": 4995021.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.10496218409887474, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.01358648284804076, "learning_rate": 1.047004608294931e-05, "loss": 0.0005, "num_tokens": 5001659.0, "reward": 1.8242753744125366, "reward_std": 0.08177722245454788, "rewards/fixed_code_pass_all_test_reward/mean": 0.8242753744125366, "rewards/fixed_code_pass_all_test_reward/std": 0.08177726715803146, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.10514665190924184, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.03694136347621679, "learning_rate": 1.0488479262672814e-05, "loss": 0.0015, "num_tokens": 5010368.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.10533111971960893, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.012418287515174598, "learning_rate": 1.0506912442396313e-05, "loss": 0.0005, "num_tokens": 5014950.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.10551558752997602, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.008490026375511661, "learning_rate": 1.0525345622119817e-05, "loss": 0.0003, "num_tokens": 5019489.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.10570005534034312, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.021591264056041837, "learning_rate": 1.0543778801843318e-05, "loss": 0.0009, "num_tokens": 5031435.0, "reward": 1.0499999523162842, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.1058845231507102, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03303455375134945, "learning_rate": 1.0562211981566821e-05, "loss": 0.0013, "num_tokens": 5038716.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.10606899096107729, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02922314265742898, "learning_rate": 1.0580645161290325e-05, "loss": 0.0012, "num_tokens": 5048540.0, "reward": 1.4583333730697632, "reward_std": 0.669061541557312, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.3949388563632965, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.10625345877144439, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.023002833884675056, "learning_rate": 1.0599078341013826e-05, "loss": 0.0009, "num_tokens": 5052823.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 530.125, "completions/mean_terminated_length": 530.125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.10643792658181148, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.01133900583954528, "learning_rate": 1.061751152073733e-05, "loss": 0.0005, "num_tokens": 5067360.0, "reward": 0.26923078298568726, "reward_std": 0.49851855635643005, "rewards/fixed_code_pass_all_test_reward/mean": 0.01923076994717121, "rewards/fixed_code_pass_all_test_reward/std": 0.03560846671462059, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.10662239439217856, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03943873860407621, "learning_rate": 1.063594470046083e-05, "loss": 0.0016, "num_tokens": 5074867.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.10680686220254565, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.024694595718756318, "learning_rate": 1.0654377880184332e-05, "loss": 0.001, "num_tokens": 5084270.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.10699133001291275, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.02643735590390861, "learning_rate": 1.0672811059907836e-05, "loss": 0.0011, "num_tokens": 5093064.0, "reward": 1.7060810327529907, "reward_std": 0.33397406339645386, "rewards/fixed_code_pass_all_test_reward/mean": 0.7060810327529907, "rewards/fixed_code_pass_all_test_reward/std": 0.33397406339645386, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.10717579782327984, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.021516600623726845, "learning_rate": 1.0691244239631337e-05, "loss": 0.0009, "num_tokens": 5102137.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.10736026563364692, "frac_reward_zero_std": 1.0, "grad_norm": 0.032470703125, "kl": 0.004570527788018808, "learning_rate": 1.070967741935484e-05, "loss": 0.0002, "num_tokens": 5108540.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.10754473344401402, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.022146267001517117, "learning_rate": 1.0728110599078344e-05, "loss": 0.0009, "num_tokens": 5115203.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 454.125, "completions/mean_terminated_length": 454.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.10772920125438111, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.017787566874176264, "learning_rate": 1.0746543778801843e-05, "loss": 0.0007, "num_tokens": 5127564.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1079136690647482, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.029783689766190946, "learning_rate": 1.0764976958525347e-05, "loss": 0.0012, "num_tokens": 5133500.0, "reward": 0.9524999856948853, "reward_std": 0.29798126220703125, "rewards/fixed_code_pass_all_test_reward/mean": 0.07750000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.06713525950908661, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1080981368751153, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.015450290171429515, "learning_rate": 1.0783410138248848e-05, "loss": 0.0006, "num_tokens": 5142377.0, "reward": 1.7265625, "reward_std": 0.4251280128955841, "rewards/fixed_code_pass_all_test_reward/mean": 0.7265625, "rewards/fixed_code_pass_all_test_reward/std": 0.4251280128955841, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10828260468548238, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.029802936245687306, "learning_rate": 1.0801843317972351e-05, "loss": 0.0012, "num_tokens": 5152255.0, "reward": 1.8952702283859253, "reward_std": 0.19962267577648163, "rewards/fixed_code_pass_all_test_reward/mean": 0.8952702283859253, "rewards/fixed_code_pass_all_test_reward/std": 0.19962267577648163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.10846707249584947, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.02647989382967353, "learning_rate": 1.0820276497695855e-05, "loss": 0.0011, "num_tokens": 5161055.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 401.0, "completions/mean_terminated_length": 401.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.10865154030621657, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.01807076029945165, "learning_rate": 1.0838709677419356e-05, "loss": 0.0007, "num_tokens": 5169391.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.10883600811658366, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.006720381221384741, "learning_rate": 1.0857142857142858e-05, "loss": 0.0003, "num_tokens": 5174319.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 554.5, "completions/mean_terminated_length": 554.5, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.10902047592695074, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.007886221806984395, "learning_rate": 1.087557603686636e-05, "loss": 0.0003, "num_tokens": 5184907.0, "reward": 1.9444444179534912, "reward_std": 0.11878276616334915, "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.11878277361392975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.10920494373731784, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.029293954838067293, "learning_rate": 1.0894009216589863e-05, "loss": 0.0012, "num_tokens": 5190653.0, "reward": 1.05978262424469, "reward_std": 0.15228478610515594, "rewards/fixed_code_pass_all_test_reward/mean": 0.05978260561823845, "rewards/fixed_code_pass_all_test_reward/std": 0.15228478610515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.10938941154768493, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.024786977330222726, "learning_rate": 1.0912442396313366e-05, "loss": 0.001, "num_tokens": 5198900.0, "reward": 1.6875, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.10957387935805202, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.01650290295947343, "learning_rate": 1.0930875576036867e-05, "loss": 0.0007, "num_tokens": 5205091.0, "reward": 1.165000081062317, "reward_std": 0.04242644086480141, "rewards/fixed_code_pass_all_test_reward/mean": 0.16500000655651093, "rewards/fixed_code_pass_all_test_reward/std": 0.04242641106247902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1097583471684191, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.03848157008178532, "learning_rate": 1.094930875576037e-05, "loss": 0.0015, "num_tokens": 5214942.0, "reward": 1.2687499523162842, "reward_std": 0.6099985837936401, "rewards/fixed_code_pass_all_test_reward/mean": 0.39375001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.3668763041496277, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1099428149787862, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.01824515702901408, "learning_rate": 1.096774193548387e-05, "loss": 0.0007, "num_tokens": 5223324.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.11012728278915329, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.02129334374330938, "learning_rate": 1.0986175115207374e-05, "loss": 0.0009, "num_tokens": 5227550.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.11031175059952038, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.033238760370295495, "learning_rate": 1.1004608294930877e-05, "loss": 0.0013, "num_tokens": 5231478.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.11049621840988748, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.028226650087162852, "learning_rate": 1.1023041474654378e-05, "loss": 0.0011, "num_tokens": 5237934.0, "reward": 1.438829779624939, "reward_std": 0.6352447271347046, "rewards/fixed_code_pass_all_test_reward/mean": 0.563829779624939, "rewards/fixed_code_pass_all_test_reward/std": 0.3426975905895233, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 518.0, "completions/mean_terminated_length": 518.0, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.11068068622025456, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.013978310453239828, "learning_rate": 1.1041474654377882e-05, "loss": 0.0006, "num_tokens": 5253166.0, "reward": 1.1607141494750977, "reward_std": 0.4734187722206116, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.3148418068885803, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.11086515403062165, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.024668479105457664, "learning_rate": 1.1059907834101385e-05, "loss": 0.001, "num_tokens": 5259232.0, "reward": 1.5384615659713745, "reward_std": 0.6014915108680725, "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.30423885583877563, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.11104962184098875, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.02973625552840531, "learning_rate": 1.1078341013824885e-05, "loss": 0.0012, "num_tokens": 5268149.0, "reward": 1.1287128925323486, "reward_std": 0.640127956867218, "rewards/fixed_code_pass_all_test_reward/mean": 0.25371289253234863, "rewards/fixed_code_pass_all_test_reward/std": 0.4607324004173279, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11123408965135584, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.05151755781844258, "learning_rate": 1.1096774193548388e-05, "loss": 0.0021, "num_tokens": 5274778.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.11141855746172293, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.03009527293033898, "learning_rate": 1.111520737327189e-05, "loss": 0.0012, "num_tokens": 5284420.0, "reward": 1.6722973585128784, "reward_std": 0.34677374362945557, "rewards/fixed_code_pass_all_test_reward/mean": 0.6722972989082336, "rewards/fixed_code_pass_all_test_reward/std": 0.34677374362945557, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 403.25, "completions/mean_terminated_length": 403.25, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.11160302527209003, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.014786576968617737, "learning_rate": 1.1133640552995393e-05, "loss": 0.0006, "num_tokens": 5293350.0, "reward": 1.3229167461395264, "reward_std": 0.3225564956665039, "rewards/fixed_code_pass_all_test_reward/mean": 0.4479166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.36983880400657654, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 525.25, "completions/mean_terminated_length": 307.71429443359375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.11178749308245711, "frac_reward_zero_std": 0.0, "grad_norm": 0.49609375, "kl": 0.014758320088731125, "learning_rate": 1.1152073732718896e-05, "loss": 0.0006, "num_tokens": 5304880.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1119719608928242, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.03478221967816353, "learning_rate": 1.1170506912442397e-05, "loss": 0.0014, "num_tokens": 5310014.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 431.5, "completions/mean_terminated_length": 431.5, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.1121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.018478322017472237, "learning_rate": 1.1188940092165899e-05, "loss": 0.0007, "num_tokens": 5323522.0, "reward": 1.8693182468414307, "reward_std": 0.3344077169895172, "rewards/fixed_code_pass_all_test_reward/mean": 0.8693181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3344077169895172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.11234089651355839, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.033673537662252784, "learning_rate": 1.12073732718894e-05, "loss": 0.0013, "num_tokens": 5332121.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.11252536432392547, "frac_reward_zero_std": 1.0, "grad_norm": 0.051513671875, "kl": 0.014329691766761243, "learning_rate": 1.1225806451612904e-05, "loss": 0.0006, "num_tokens": 5342309.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.11270983213429256, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.01646226894808933, "learning_rate": 1.1244239631336407e-05, "loss": 0.0007, "num_tokens": 5350120.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.11289429994465966, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.01793209568131715, "learning_rate": 1.1262672811059908e-05, "loss": 0.0007, "num_tokens": 5360471.0, "reward": 1.446969747543335, "reward_std": 0.2357022762298584, "rewards/fixed_code_pass_all_test_reward/mean": 0.4469697177410126, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 512.0, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.11307876775502675, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.020935556502081454, "learning_rate": 1.1281105990783412e-05, "loss": 0.0008, "num_tokens": 5374583.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.11326323556539383, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0180795278865844, "learning_rate": 1.1299539170506913e-05, "loss": 0.0007, "num_tokens": 5383073.0, "reward": 1.7674418687820435, "reward_std": 0.4313310980796814, "rewards/fixed_code_pass_all_test_reward/mean": 0.7674418687820435, "rewards/fixed_code_pass_all_test_reward/std": 0.4313310980796814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.11344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.01955062581691891, "learning_rate": 1.1317972350230415e-05, "loss": 0.0008, "num_tokens": 5389406.0, "reward": 1.1222527027130127, "reward_std": 0.7924602031707764, "rewards/fixed_code_pass_all_test_reward/mean": 0.37225276231765747, "rewards/fixed_code_pass_all_test_reward/std": 0.44832080602645874, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.11363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.02083478239364922, "learning_rate": 1.1336405529953918e-05, "loss": 0.0008, "num_tokens": 5398158.0, "reward": 1.49609375, "reward_std": 0.4480356276035309, "rewards/fixed_code_pass_all_test_reward/mean": 0.49609375, "rewards/fixed_code_pass_all_test_reward/std": 0.4480356276035309, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.1138166389964951, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.015172100509516895, "learning_rate": 1.1354838709677421e-05, "loss": 0.0006, "num_tokens": 5407732.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 537.875, "completions/mean_terminated_length": 537.875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.1140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.007468666532076895, "learning_rate": 1.1373271889400923e-05, "loss": 0.0003, "num_tokens": 5419843.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1141855746172293, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.018508557637687773, "learning_rate": 1.1391705069124426e-05, "loss": 0.0007, "num_tokens": 5429022.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.11437004242759638, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.03095754235982895, "learning_rate": 1.1410138248847926e-05, "loss": 0.0012, "num_tokens": 5440286.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.11455451023796348, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.019285981077700853, "learning_rate": 1.1428571428571429e-05, "loss": 0.0008, "num_tokens": 5444367.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.11473897804833057, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.0254968018271029, "learning_rate": 1.1447004608294932e-05, "loss": 0.001, "num_tokens": 5452980.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.11492344585869765, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.01032915327232331, "learning_rate": 1.1465437788018434e-05, "loss": 0.0004, "num_tokens": 5456862.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.11510791366906475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.013131651096045971, "learning_rate": 1.1483870967741937e-05, "loss": 0.0005, "num_tokens": 5465573.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 680.375, "completions/mean_terminated_length": 680.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.11529238147943184, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.007369970437139273, "learning_rate": 1.150230414746544e-05, "loss": 0.0003, "num_tokens": 5482096.0, "reward": 1.189814805984497, "reward_std": 0.18298962712287903, "rewards/fixed_code_pass_all_test_reward/mean": 0.18981480598449707, "rewards/fixed_code_pass_all_test_reward/std": 0.18298962712287903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 493.875, "completions/mean_terminated_length": 493.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.11547684928979893, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.008883859787601978, "learning_rate": 1.152073732718894e-05, "loss": 0.0004, "num_tokens": 5493063.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.11566131710016603, "frac_reward_zero_std": 1.0, "grad_norm": 0.1884765625, "kl": 0.03757879603654146, "learning_rate": 1.1539170506912443e-05, "loss": 0.0015, "num_tokens": 5501743.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 127.125, "completions/mean_terminated_length": 127.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.11584578491053311, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.027318429434671998, "learning_rate": 1.1557603686635945e-05, "loss": 0.0011, "num_tokens": 5505400.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 632.0, "completions/mean_terminated_length": 632.0, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.1160302527209002, "frac_reward_zero_std": 0.0, "grad_norm": 0.466796875, "kl": 0.006017070059897378, "learning_rate": 1.1576036866359448e-05, "loss": 0.0002, "num_tokens": 5517320.0, "reward": 1.1458332538604736, "reward_std": 0.058925557881593704, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.11621472053126729, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.01838120660977438, "learning_rate": 1.1594470046082951e-05, "loss": 0.0007, "num_tokens": 5521774.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 432.0, "completions/mean_terminated_length": 432.0, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.11639918834163439, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.00994636700488627, "learning_rate": 1.1612903225806453e-05, "loss": 0.0004, "num_tokens": 5529886.0, "reward": 1.798295497894287, "reward_std": 0.04108459874987602, "rewards/fixed_code_pass_all_test_reward/mean": 0.7982954382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.041084595024585724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 280.875, "completions/mean_terminated_length": 280.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.11658365615200147, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.021577075240202248, "learning_rate": 1.1631336405529954e-05, "loss": 0.0009, "num_tokens": 5535989.0, "reward": 1.6586538553237915, "reward_std": 0.47214797139167786, "rewards/fixed_code_pass_all_test_reward/mean": 0.6586538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.47214800119400024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.11676812396236856, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.011569142865482718, "learning_rate": 1.1649769585253456e-05, "loss": 0.0005, "num_tokens": 5541010.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.11695259177273566, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.008614380727522075, "learning_rate": 1.1668202764976959e-05, "loss": 0.0003, "num_tokens": 5553012.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 395.5, "completions/mean_terminated_length": 395.5, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.11713705958310275, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.01758048264309764, "learning_rate": 1.1686635944700462e-05, "loss": 0.0007, "num_tokens": 5563400.0, "reward": 1.7437500953674316, "reward_std": 0.37553533911705017, "rewards/fixed_code_pass_all_test_reward/mean": 0.7437499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.37553533911705017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 526.625, "completions/mean_terminated_length": 526.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.11732152739346983, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.011207400704734027, "learning_rate": 1.1705069124423964e-05, "loss": 0.0004, "num_tokens": 5576717.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 598.375, "completions/mean_terminated_length": 598.375, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.11750599520383694, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.01113398966845125, "learning_rate": 1.1723502304147467e-05, "loss": 0.0004, "num_tokens": 5590880.0, "reward": 1.317307710647583, "reward_std": 0.3088918924331665, "rewards/fixed_code_pass_all_test_reward/mean": 0.317307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.3088918924331665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 374.375, "completions/mean_terminated_length": 374.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.11769046301420402, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.010011591337388381, "learning_rate": 1.1741935483870967e-05, "loss": 0.0004, "num_tokens": 5597931.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.11787493082457111, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.01768647402059287, "learning_rate": 1.176036866359447e-05, "loss": 0.0007, "num_tokens": 5609623.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.11805939863493821, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.009646464277466293, "learning_rate": 1.1778801843317973e-05, "loss": 0.0004, "num_tokens": 5616243.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1182438664453053, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.014067991636693478, "learning_rate": 1.1797235023041475e-05, "loss": 0.0006, "num_tokens": 5620859.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.11842833425567238, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.017923756560776383, "learning_rate": 1.1815668202764978e-05, "loss": 0.0007, "num_tokens": 5631436.0, "reward": 1.537500023841858, "reward_std": 0.36620640754699707, "rewards/fixed_code_pass_all_test_reward/mean": 0.5375000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.36620640754699707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 596.75, "completions/mean_terminated_length": 596.75, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.11861280206603948, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.009247386828064919, "learning_rate": 1.1834101382488481e-05, "loss": 0.0004, "num_tokens": 5644466.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.11879726987640657, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.01566549949347973, "learning_rate": 1.1852534562211983e-05, "loss": 0.0006, "num_tokens": 5650387.0, "reward": 1.21875, "reward_std": 0.321014940738678, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.321014940738678, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.11898173768677366, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01485956716351211, "learning_rate": 1.1870967741935484e-05, "loss": 0.0006, "num_tokens": 5657120.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.11916620549714074, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.012116140278521925, "learning_rate": 1.1889400921658986e-05, "loss": 0.0005, "num_tokens": 5660918.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.11935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.01627954060677439, "learning_rate": 1.190783410138249e-05, "loss": 0.0007, "num_tokens": 5667259.0, "reward": 1.1956522464752197, "reward_std": 0.606918454170227, "rewards/fixed_code_pass_all_test_reward/mean": 0.32065218687057495, "rewards/fixed_code_pass_all_test_reward/std": 0.3895318806171417, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 495.0, "completions/mean_terminated_length": 495.0, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.11953514111787493, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.009384277684148401, "learning_rate": 1.1926267281105992e-05, "loss": 0.0004, "num_tokens": 5677059.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.11971960892824202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.014314852189272642, "learning_rate": 1.1944700460829494e-05, "loss": 0.0006, "num_tokens": 5681436.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.11990407673860912, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.02489046868868172, "learning_rate": 1.1963133640552997e-05, "loss": 0.001, "num_tokens": 5692493.0, "reward": 1.2386363744735718, "reward_std": 0.6297194957733154, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.40945151448249817, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 474.25, "completions/mean_terminated_length": 474.25, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.1200885445489762, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.021713092806749046, "learning_rate": 1.1981566820276497e-05, "loss": 0.0009, "num_tokens": 5701591.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.12027301235934329, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.018083055620081723, "learning_rate": 1.2e-05, "loss": 0.0007, "num_tokens": 5708296.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 639.5, "completions/mean_terminated_length": 639.5, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.12045748016971039, "frac_reward_zero_std": 0.0, "grad_norm": 0.49609375, "kl": 0.005555703130085021, "learning_rate": 1.2018433179723504e-05, "loss": 0.0002, "num_tokens": 5718860.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.12064194798007748, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0211978608276695, "learning_rate": 1.2036866359447005e-05, "loss": 0.0008, "num_tokens": 5725828.0, "reward": 1.3624999523162842, "reward_std": 0.2825268805027008, "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.2825268805027008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.12082641579044456, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.007720790046732873, "learning_rate": 1.2055299539170508e-05, "loss": 0.0003, "num_tokens": 5729762.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 489.125, "completions/mean_terminated_length": 489.125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.12101088360081166, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.020320967538282275, "learning_rate": 1.2073732718894012e-05, "loss": 0.0008, "num_tokens": 5741347.0, "reward": 1.15625, "reward_std": 0.2156454473733902, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.21564547717571259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 286.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.12119535141117875, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03287648502737284, "learning_rate": 1.2092165898617511e-05, "loss": 0.0013, "num_tokens": 5749689.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.12137981922154584, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.02426884847227484, "learning_rate": 1.2110599078341015e-05, "loss": 0.001, "num_tokens": 5755773.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 470.5, "completions/mean_terminated_length": 470.5, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.12156428703191294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0247802734375, "kl": 0.007138237240724266, "learning_rate": 1.2129032258064518e-05, "loss": 0.0003, "num_tokens": 5764649.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.12174875484228002, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.01998111023567617, "learning_rate": 1.214746543778802e-05, "loss": 0.0008, "num_tokens": 5773280.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.12193322265264711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.021241999347694218, "learning_rate": 1.2165898617511523e-05, "loss": 0.0008, "num_tokens": 5780102.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.1221176904630142, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.02732308954000473, "learning_rate": 1.2184331797235026e-05, "loss": 0.0011, "num_tokens": 5789164.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.1223021582733813, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.011022712336853147, "learning_rate": 1.2202764976958526e-05, "loss": 0.0004, "num_tokens": 5796059.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 399.125, "completions/mean_terminated_length": 399.125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.12248662608374838, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.020758160739205778, "learning_rate": 1.2221198156682029e-05, "loss": 0.0008, "num_tokens": 5806644.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 409.0, "completions/mean_terminated_length": 409.0, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.12267109389411547, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.016644110437482595, "learning_rate": 1.223963133640553e-05, "loss": 0.0007, "num_tokens": 5818980.0, "reward": 1.2419354915618896, "reward_std": 0.3325643837451935, "rewards/fixed_code_pass_all_test_reward/mean": 0.24193547666072845, "rewards/fixed_code_pass_all_test_reward/std": 0.3325643837451935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.12285556170448257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.012655959464609623, "learning_rate": 1.2258064516129034e-05, "loss": 0.0005, "num_tokens": 5825847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.12304002951484966, "frac_reward_zero_std": 1.0, "grad_norm": 0.333984375, "kl": 0.04878741386346519, "learning_rate": 1.2276497695852537e-05, "loss": 0.002, "num_tokens": 5829652.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.12322449732521674, "frac_reward_zero_std": 1.0, "grad_norm": 0.040771484375, "kl": 0.00492924073478207, "learning_rate": 1.2294930875576038e-05, "loss": 0.0002, "num_tokens": 5834812.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.12340896513558385, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.016838991665281355, "learning_rate": 1.231336405529954e-05, "loss": 0.0007, "num_tokens": 5840693.0, "reward": 1.4666666984558105, "reward_std": 0.33806174993515015, "rewards/fixed_code_pass_all_test_reward/mean": 0.46666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.33806169033050537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 105.625, "completions/mean_terminated_length": 105.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.12359343294595093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.010949868243187666, "learning_rate": 1.2331797235023041e-05, "loss": 0.0004, "num_tokens": 5844234.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 584.5, "completions/mean_terminated_length": 584.5, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.12377790075631802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.009572273876983672, "learning_rate": 1.2350230414746545e-05, "loss": 0.0004, "num_tokens": 5854638.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.12396236856668512, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0073857964016497135, "learning_rate": 1.2368663594470048e-05, "loss": 0.0003, "num_tokens": 5861388.0, "reward": 1.6944444179534912, "reward_std": 0.5134865045547485, "rewards/fixed_code_pass_all_test_reward/mean": 0.8194444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.25153848528862, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1241468363770522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.012636109720915556, "learning_rate": 1.238709677419355e-05, "loss": 0.0005, "num_tokens": 5870871.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.12433130418741929, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.016431959345936775, "learning_rate": 1.2405529953917053e-05, "loss": 0.0007, "num_tokens": 5877088.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 500.5, "completions/mean_terminated_length": 500.5, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.12451577199778639, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.011068681604228914, "learning_rate": 1.2423963133640553e-05, "loss": 0.0004, "num_tokens": 5885404.0, "reward": 1.6785714626312256, "reward_std": 0.27027443051338196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.27027443051338196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.12470023980815348, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.01131491910200566, "learning_rate": 1.2442396313364056e-05, "loss": 0.0005, "num_tokens": 5894584.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.12488470761852057, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.006685057916911319, "learning_rate": 1.2460829493087559e-05, "loss": 0.0003, "num_tokens": 5900255.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.12506917542888765, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.02956463862210512, "learning_rate": 1.247926267281106e-05, "loss": 0.0012, "num_tokens": 5908749.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.12525364323925475, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.008470706292428076, "learning_rate": 1.2497695852534564e-05, "loss": 0.0003, "num_tokens": 5914297.0, "reward": 1.7999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.12543811104962185, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.01003311324166134, "learning_rate": 1.2516129032258067e-05, "loss": 0.0004, "num_tokens": 5922688.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.12562257885998893, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.01014643389498815, "learning_rate": 1.2534562211981567e-05, "loss": 0.0004, "num_tokens": 5927764.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 351.625, "completions/mean_terminated_length": 351.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.12580704667035603, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.014334492763737217, "learning_rate": 1.255299539170507e-05, "loss": 0.0006, "num_tokens": 5933401.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 514.125, "completions/mean_terminated_length": 514.125, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.12599151448072313, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.009524707915261388, "learning_rate": 1.2571428571428572e-05, "loss": 0.0004, "num_tokens": 5946890.0, "reward": 0.8981481194496155, "reward_std": 0.3484238386154175, "rewards/fixed_code_pass_all_test_reward/mean": 0.023148149251937866, "rewards/fixed_code_pass_all_test_reward/std": 0.019168486818671227, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1261759822910902, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.013763547642156482, "learning_rate": 1.2589861751152075e-05, "loss": 0.0006, "num_tokens": 5954399.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1263604501014573, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.00900739652570337, "learning_rate": 1.2608294930875578e-05, "loss": 0.0004, "num_tokens": 5962546.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 402.875, "completions/mean_terminated_length": 402.875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.12654491791182437, "frac_reward_zero_std": 1.0, "grad_norm": 0.038330078125, "kl": 0.012674597732257098, "learning_rate": 1.262672811059908e-05, "loss": 0.0005, "num_tokens": 5970577.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.12672938572219147, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.01283425884321332, "learning_rate": 1.2645161290322581e-05, "loss": 0.0005, "num_tokens": 5978612.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.12691385353255857, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.009311013913247734, "learning_rate": 1.2663594470046083e-05, "loss": 0.0004, "num_tokens": 5983202.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 581.5, "completions/mean_terminated_length": 372.0000305175781, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.12709832134292565, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.017149349150713533, "learning_rate": 1.2682027649769586e-05, "loss": 0.0007, "num_tokens": 5997262.0, "reward": 1.4905303716659546, "reward_std": 0.6186563372612, "rewards/fixed_code_pass_all_test_reward/mean": 0.6155303120613098, "rewards/fixed_code_pass_all_test_reward/std": 0.28612905740737915, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.12728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.022962539340369403, "learning_rate": 1.2700460829493089e-05, "loss": 0.0009, "num_tokens": 6004764.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.12746725696365985, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.009973986627301201, "learning_rate": 1.271889400921659e-05, "loss": 0.0004, "num_tokens": 6013638.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.12765172477402692, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.024273328890558332, "learning_rate": 1.2737327188940094e-05, "loss": 0.001, "num_tokens": 6017529.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.12783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.014770885405596346, "learning_rate": 1.2755760368663594e-05, "loss": 0.0006, "num_tokens": 6022373.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.12802066039476112, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.056296321912668645, "learning_rate": 1.2774193548387097e-05, "loss": 0.0023, "num_tokens": 6030612.0, "reward": 1.6052632331848145, "reward_std": 0.3740176558494568, "rewards/fixed_code_pass_all_test_reward/mean": 0.6052631139755249, "rewards/fixed_code_pass_all_test_reward/std": 0.3740176260471344, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 457.625, "completions/mean_terminated_length": 457.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.013139610411599278, "learning_rate": 1.27926267281106e-05, "loss": 0.0005, "num_tokens": 6041921.0, "reward": 1.4964789152145386, "reward_std": 0.3884108066558838, "rewards/fixed_code_pass_all_test_reward/mean": 0.6214789152145386, "rewards/fixed_code_pass_all_test_reward/std": 0.034857384860515594, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.1283895960154953, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.016443020664155483, "learning_rate": 1.2811059907834102e-05, "loss": 0.0007, "num_tokens": 6048047.0, "reward": 1.01260507106781, "reward_std": 0.03565245121717453, "rewards/fixed_code_pass_all_test_reward/mean": 0.012605042196810246, "rewards/fixed_code_pass_all_test_reward/std": 0.03565244376659393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 410.125, "completions/mean_terminated_length": 410.125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.1285740638258624, "frac_reward_zero_std": 0.0, "grad_norm": 0.69921875, "kl": 0.009359834366478026, "learning_rate": 1.2829493087557605e-05, "loss": 0.0004, "num_tokens": 6057568.0, "reward": 1.625, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 494.125, "completions/mean_terminated_length": 494.125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.12875853163622947, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.008197460934752598, "learning_rate": 1.2847926267281108e-05, "loss": 0.0003, "num_tokens": 6067049.0, "reward": 1.7675000429153442, "reward_std": 0.09498120844364166, "rewards/fixed_code_pass_all_test_reward/mean": 0.7675000429153442, "rewards/fixed_code_pass_all_test_reward/std": 0.09498120844364166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.12894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.03380183281842619, "learning_rate": 1.2866359447004608e-05, "loss": 0.0014, "num_tokens": 6071025.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.12912746725696367, "frac_reward_zero_std": 1.0, "grad_norm": 0.81640625, "kl": 0.06187940342351794, "learning_rate": 1.2884792626728111e-05, "loss": 0.0025, "num_tokens": 6080171.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.12931193506733074, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.0343069777591154, "learning_rate": 1.2903225806451613e-05, "loss": 0.0014, "num_tokens": 6088596.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 407.125, "completions/mean_terminated_length": 407.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.12949640287769784, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.01815927936695516, "learning_rate": 1.2921658986175116e-05, "loss": 0.0007, "num_tokens": 6096477.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.12968087068806494, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.025725731742568314, "learning_rate": 1.294009216589862e-05, "loss": 0.001, "num_tokens": 6102656.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.12986533849843201, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.018579981464426965, "learning_rate": 1.2958525345622122e-05, "loss": 0.0007, "num_tokens": 6113211.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.13004980630879912, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.026171888457611203, "learning_rate": 1.2976958525345624e-05, "loss": 0.001, "num_tokens": 6122124.0, "reward": 1.0299999713897705, "reward_std": 0.08485280722379684, "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13023427411916622, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.03152118972502649, "learning_rate": 1.2995391705069126e-05, "loss": 0.0013, "num_tokens": 6130544.0, "reward": 1.3571429252624512, "reward_std": 0.4501376152038574, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2645200192928314, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 350.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.1304187419295333, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.018276812275871634, "learning_rate": 1.3013824884792627e-05, "loss": 0.0007, "num_tokens": 6142418.0, "reward": 1.78125, "reward_std": 0.3582572042942047, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.3582572042942047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1306032097399004, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.027830975246615708, "learning_rate": 1.303225806451613e-05, "loss": 0.0011, "num_tokens": 6148754.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 757.625, "completions/mean_terminated_length": 573.2857666015625, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.1307876775502675, "frac_reward_zero_std": 0.0, "grad_norm": 0.255859375, "kl": 0.005082618095912039, "learning_rate": 1.3050691244239634e-05, "loss": 0.0002, "num_tokens": 6161671.0, "reward": 1.0208332538604736, "reward_std": 0.4124789237976074, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 397.0, "completions/mean_terminated_length": 397.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.13097214536063456, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.01624119805637747, "learning_rate": 1.3069124423963135e-05, "loss": 0.0006, "num_tokens": 6169303.0, "reward": 1.5988805294036865, "reward_std": 0.6742870211601257, "rewards/fixed_code_pass_all_test_reward/mean": 0.7238805890083313, "rewards/fixed_code_pass_all_test_reward/std": 0.35048529505729675, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 346.75, "completions/mean_terminated_length": 346.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.13115661317100166, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.03460461529903114, "learning_rate": 1.3087557603686638e-05, "loss": 0.0014, "num_tokens": 6178869.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.13134108098136876, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.021799714537337422, "learning_rate": 1.3105990783410138e-05, "loss": 0.0009, "num_tokens": 6189030.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 619.75, "completions/mean_terminated_length": 619.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.13152554879173584, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.016654736478812993, "learning_rate": 1.3124423963133641e-05, "loss": 0.0007, "num_tokens": 6201876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.13171001660210294, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.018368340213783085, "learning_rate": 1.3142857142857145e-05, "loss": 0.0007, "num_tokens": 6206629.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.13189448441247004, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.019721908261999488, "learning_rate": 1.3161290322580646e-05, "loss": 0.0008, "num_tokens": 6213073.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1320789522228371, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.02149338141316548, "learning_rate": 1.317972350230415e-05, "loss": 0.0009, "num_tokens": 6224487.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 891.0, "completions/mean_terminated_length": 505.3333435058594, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.1322634200332042, "frac_reward_zero_std": 0.0, "grad_norm": 0.53515625, "kl": 0.015899461170192808, "learning_rate": 1.3198156682027653e-05, "loss": 0.0006, "num_tokens": 6237023.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 525.0, "completions/mean_terminated_length": 525.0, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.1324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.012928415031637996, "learning_rate": 1.3216589861751152e-05, "loss": 0.0005, "num_tokens": 6246591.0, "reward": 1.6623376607894897, "reward_std": 0.17465344071388245, "rewards/fixed_code_pass_all_test_reward/mean": 0.6623376607894897, "rewards/fixed_code_pass_all_test_reward/std": 0.17465342581272125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 607.25, "completions/mean_terminated_length": 607.25, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.13263235565393838, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.0160870713298209, "learning_rate": 1.3235023041474656e-05, "loss": 0.0006, "num_tokens": 6257681.0, "reward": 1.5833333730697632, "reward_std": 0.1543033868074417, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.13281682346430548, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.030950271990150213, "learning_rate": 1.3253456221198157e-05, "loss": 0.0012, "num_tokens": 6267121.0, "reward": 1.1363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.13300129127467256, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.013472165039274842, "learning_rate": 1.327188940092166e-05, "loss": 0.0005, "num_tokens": 6272134.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.13318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.03463879611808807, "learning_rate": 1.3290322580645164e-05, "loss": 0.0014, "num_tokens": 6281695.0, "reward": 1.8046875, "reward_std": 0.36164847016334534, "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, "rewards/fixed_code_pass_all_test_reward/std": 0.36164847016334534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.13337022689540676, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.01731483032926917, "learning_rate": 1.3308755760368665e-05, "loss": 0.0007, "num_tokens": 6291781.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.13355469470577383, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.032740039052441716, "learning_rate": 1.3327188940092167e-05, "loss": 0.0013, "num_tokens": 6295615.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.13373916251614093, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.038838914944790304, "learning_rate": 1.3345622119815668e-05, "loss": 0.0016, "num_tokens": 6300739.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 529.125, "completions/mean_terminated_length": 529.125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.13392363032650803, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.015299848339054734, "learning_rate": 1.3364055299539171e-05, "loss": 0.0006, "num_tokens": 6311716.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 421.125, "completions/mean_terminated_length": 421.125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.1341080981368751, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0211512305540964, "learning_rate": 1.3382488479262675e-05, "loss": 0.0008, "num_tokens": 6322285.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1342925659472422, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.026305558742024004, "learning_rate": 1.3400921658986176e-05, "loss": 0.0011, "num_tokens": 6333773.0, "reward": 1.322115421295166, "reward_std": 0.3137706220149994, "rewards/fixed_code_pass_all_test_reward/mean": 0.322115421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.31377068161964417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 423.125, "completions/mean_terminated_length": 423.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.030887339846231043, "learning_rate": 1.341935483870968e-05, "loss": 0.0012, "num_tokens": 6344086.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.13466150156797638, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.02194003330077976, "learning_rate": 1.343778801843318e-05, "loss": 0.0009, "num_tokens": 6353355.0, "reward": 1.5646929740905762, "reward_std": 0.1643713414669037, "rewards/fixed_code_pass_all_test_reward/mean": 0.5646929740905762, "rewards/fixed_code_pass_all_test_reward/std": 0.1643713265657425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.13484596937834348, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.022425944334827363, "learning_rate": 1.3456221198156683e-05, "loss": 0.0009, "num_tokens": 6357882.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.13503043718871058, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.027519751572981477, "learning_rate": 1.3474654377880186e-05, "loss": 0.0011, "num_tokens": 6366934.0, "reward": 1.946969747543335, "reward_std": 0.14999237656593323, "rewards/fixed_code_pass_all_test_reward/mean": 0.9469696879386902, "rewards/fixed_code_pass_all_test_reward/std": 0.14999234676361084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.13521490499907765, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.031093080062419176, "learning_rate": 1.3493087557603687e-05, "loss": 0.0012, "num_tokens": 6375602.0, "reward": 1.4375, "reward_std": 0.6781013607978821, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 580.25, "completions/mean_terminated_length": 580.25, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.13539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.04468227177858353, "learning_rate": 1.351152073732719e-05, "loss": 0.0018, "num_tokens": 6392132.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.13558384061981185, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.024280450074002147, "learning_rate": 1.3529953917050694e-05, "loss": 0.001, "num_tokens": 6398625.0, "reward": 1.1458332538604736, "reward_std": 0.03857579827308655, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.13576830843017892, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.026149121462367475, "learning_rate": 1.3548387096774194e-05, "loss": 0.001, "num_tokens": 6406457.0, "reward": 1.9358108043670654, "reward_std": 0.18155446648597717, "rewards/fixed_code_pass_all_test_reward/mean": 0.9358108043670654, "rewards/fixed_code_pass_all_test_reward/std": 0.18155445158481598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.13595277624054602, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.03232508595101535, "learning_rate": 1.3566820276497697e-05, "loss": 0.0013, "num_tokens": 6415186.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 766.25, "completions/mean_terminated_length": 766.25, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.13613724405091313, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.015500634792260826, "learning_rate": 1.3585253456221198e-05, "loss": 0.0006, "num_tokens": 6431556.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.1363217118612802, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.022174455225467682, "learning_rate": 1.3603686635944702e-05, "loss": 0.0009, "num_tokens": 6439840.0, "reward": 1.640625, "reward_std": 0.4288038909435272, "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, "rewards/fixed_code_pass_all_test_reward/std": 0.4288038909435272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 355.625, "completions/mean_terminated_length": 355.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.1365061796716473, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.014136144774965942, "learning_rate": 1.3622119815668205e-05, "loss": 0.0006, "num_tokens": 6447061.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.1366906474820144, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.022066552191972733, "learning_rate": 1.3640552995391706e-05, "loss": 0.0009, "num_tokens": 6456221.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 542.5, "completions/mean_terminated_length": 542.5, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.13687511529238147, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.014162875013425946, "learning_rate": 1.3658986175115208e-05, "loss": 0.0006, "num_tokens": 6470641.0, "reward": 1.8977272510528564, "reward_std": 0.14114977419376373, "rewards/fixed_code_pass_all_test_reward/mean": 0.8977272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.14114975929260254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.13705958310274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.020656749606132507, "learning_rate": 1.367741935483871e-05, "loss": 0.0008, "num_tokens": 6476501.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 492.125, "completions/mean_terminated_length": 492.125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.13724405091311567, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.011496823804918677, "learning_rate": 1.3695852534562213e-05, "loss": 0.0005, "num_tokens": 6485814.0, "reward": 1.2678570747375488, "reward_std": 0.08321178704500198, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.08321177214384079, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13742851872348275, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.02410691638942808, "learning_rate": 1.3714285714285716e-05, "loss": 0.001, "num_tokens": 6490399.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.13761298653384985, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.026261798106133938, "learning_rate": 1.3732718894009217e-05, "loss": 0.0011, "num_tokens": 6502161.0, "reward": 1.4375, "reward_std": 0.3922051787376404, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.39220529794692993, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.13779745434421695, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.02647591312415898, "learning_rate": 1.375115207373272e-05, "loss": 0.0011, "num_tokens": 6506800.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.13798192215458402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0380859375, "kl": 0.013860256178304553, "learning_rate": 1.3769585253456222e-05, "loss": 0.0006, "num_tokens": 6512116.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 395.25, "completions/mean_terminated_length": 395.25, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.13816638996495112, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.01572678005322814, "learning_rate": 1.3788018433179724e-05, "loss": 0.0006, "num_tokens": 6521822.0, "reward": 1.8333332538604736, "reward_std": 0.32120805978775024, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.32120802998542786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.13835085777531822, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.03260247875005007, "learning_rate": 1.3806451612903227e-05, "loss": 0.0013, "num_tokens": 6530222.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 973.125, "completions/mean_terminated_length": 819.5714721679688, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.1385353255856853, "frac_reward_zero_std": 0.0, "grad_norm": 0.5, "kl": 0.010894590872339904, "learning_rate": 1.382488479262673e-05, "loss": 0.0004, "num_tokens": 6548431.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 473.75, "completions/mean_terminated_length": 473.75, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.1387197933960524, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.016499250603374094, "learning_rate": 1.3843317972350232e-05, "loss": 0.0007, "num_tokens": 6562293.0, "reward": 1.6306817531585693, "reward_std": 0.2553298771381378, "rewards/fixed_code_pass_all_test_reward/mean": 0.6306818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.25532984733581543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.13890426120641947, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.028235030127689242, "learning_rate": 1.3861751152073735e-05, "loss": 0.0011, "num_tokens": 6572300.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13908872901678657, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.03183341957628727, "learning_rate": 1.3880184331797235e-05, "loss": 0.0013, "num_tokens": 6580468.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 457.125, "completions/mean_terminated_length": 457.125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.13927319682715367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.01674209046177566, "learning_rate": 1.3898617511520738e-05, "loss": 0.0007, "num_tokens": 6589141.0, "reward": 1.1875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.13945766463752074, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.028881902107968926, "learning_rate": 1.3917050691244241e-05, "loss": 0.0012, "num_tokens": 6593445.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13964213244788784, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.03357564425095916, "learning_rate": 1.3935483870967743e-05, "loss": 0.0013, "num_tokens": 6597383.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 472.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.13982660025825494, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.021979342098347843, "learning_rate": 1.3953917050691246e-05, "loss": 0.0009, "num_tokens": 6608855.0, "reward": 1.7644230127334595, "reward_std": 0.0951874852180481, "rewards/fixed_code_pass_all_test_reward/mean": 0.7644230127334595, "rewards/fixed_code_pass_all_test_reward/std": 0.0951874628663063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.140011068068622, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.02389827778097242, "learning_rate": 1.397235023041475e-05, "loss": 0.001, "num_tokens": 6614580.0, "reward": 1.3974056243896484, "reward_std": 0.12159383296966553, "rewards/fixed_code_pass_all_test_reward/mean": 0.3974056839942932, "rewards/fixed_code_pass_all_test_reward/std": 0.12159384787082672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 80.25, "completions/mean_terminated_length": 80.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1401955358789891, "frac_reward_zero_std": 1.0, "grad_norm": 0.7578125, "kl": 0.07828154135495424, "learning_rate": 1.3990783410138249e-05, "loss": 0.0031, "num_tokens": 6618038.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 367.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.14038000368935621, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.008564533665776253, "learning_rate": 1.4009216589861752e-05, "loss": 0.0003, "num_tokens": 6625562.0, "reward": 1.8333332538604736, "reward_std": 0.34503278136253357, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.07715168595314026, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 562.125, "completions/mean_terminated_length": 562.125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.1405644714997233, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.015049646375700831, "learning_rate": 1.4027649769585254e-05, "loss": 0.0006, "num_tokens": 6639067.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 341.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.1407489393100904, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.027179012540727854, "learning_rate": 1.4046082949308757e-05, "loss": 0.0011, "num_tokens": 6649743.0, "reward": 1.0909091234207153, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.09090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.1409334071204575, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.03725534223485738, "learning_rate": 1.406451612903226e-05, "loss": 0.0015, "num_tokens": 6654285.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 541.0, "completions/mean_terminated_length": 541.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.14111787493082456, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.015308193862438202, "learning_rate": 1.4082949308755762e-05, "loss": 0.0006, "num_tokens": 6666349.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 471.5, "completions/mean_terminated_length": 471.5, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.14130234274119166, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.009917626099195331, "learning_rate": 1.4101382488479263e-05, "loss": 0.0004, "num_tokens": 6675841.0, "reward": 1.6666667461395264, "reward_std": 0.46004366874694824, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.460043728351593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 523.875, "completions/mean_terminated_length": 306.14288330078125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.14148681055155876, "frac_reward_zero_std": 0.0, "grad_norm": 0.48828125, "kl": 0.009823341250012163, "learning_rate": 1.4119815668202765e-05, "loss": 0.0004, "num_tokens": 6686856.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.14167127836192583, "frac_reward_zero_std": 1.0, "grad_norm": 0.294921875, "kl": 0.030536632519215345, "learning_rate": 1.4138248847926268e-05, "loss": 0.0012, "num_tokens": 6693845.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.14185574617229293, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.0308961751870811, "learning_rate": 1.4156682027649771e-05, "loss": 0.0012, "num_tokens": 6699701.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.14204021398266004, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.01216026209294796, "learning_rate": 1.4175115207373273e-05, "loss": 0.0005, "num_tokens": 6705565.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 590.375, "completions/mean_terminated_length": 590.375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.1422246817930271, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.008934514684369788, "learning_rate": 1.4193548387096776e-05, "loss": 0.0004, "num_tokens": 6716352.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.1424091496033942, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.01670481136534363, "learning_rate": 1.421198156682028e-05, "loss": 0.0007, "num_tokens": 6723852.0, "reward": 1.5833333730697632, "reward_std": 0.2357023060321808, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1425936174137613, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.033803693018853664, "learning_rate": 1.4230414746543779e-05, "loss": 0.0014, "num_tokens": 6728408.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.14277808522412838, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.059999131597578526, "learning_rate": 1.4248847926267282e-05, "loss": 0.0024, "num_tokens": 6732277.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.14296255303449548, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.01742211007513106, "learning_rate": 1.4267281105990784e-05, "loss": 0.0007, "num_tokens": 6742151.0, "reward": 1.9402778148651123, "reward_std": 0.08397896587848663, "rewards/fixed_code_pass_all_test_reward/mean": 0.9402777552604675, "rewards/fixed_code_pass_all_test_reward/std": 0.08397898077964783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.14314702084486258, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.03779052116442472, "learning_rate": 1.4285714285714287e-05, "loss": 0.0015, "num_tokens": 6750569.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.14333148865522966, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05229167826473713, "learning_rate": 1.430414746543779e-05, "loss": 0.0021, "num_tokens": 6758814.0, "reward": 1.2083332538604736, "reward_std": 0.28752732276916504, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.2875273525714874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.14351595646559676, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.017067412962205708, "learning_rate": 1.4322580645161292e-05, "loss": 0.0007, "num_tokens": 6768372.0, "reward": 1.777298927307129, "reward_std": 0.01219146978110075, "rewards/fixed_code_pass_all_test_reward/mean": 0.7772988080978394, "rewards/fixed_code_pass_all_test_reward/std": 0.012191482819616795, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.14370042427596386, "frac_reward_zero_std": 1.0, "grad_norm": 0.35546875, "kl": 0.029603343922644854, "learning_rate": 1.4341013824884793e-05, "loss": 0.0012, "num_tokens": 6776480.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.14388489208633093, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.027577376225963235, "learning_rate": 1.4359447004608295e-05, "loss": 0.0011, "num_tokens": 6782116.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.14406935989669803, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.045364767545834184, "learning_rate": 1.4377880184331798e-05, "loss": 0.0018, "num_tokens": 6791033.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.14425382770706513, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.019597986130975187, "learning_rate": 1.4396313364055301e-05, "loss": 0.0008, "num_tokens": 6799026.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.1444382955174322, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.029411056311801076, "learning_rate": 1.4414746543778803e-05, "loss": 0.0012, "num_tokens": 6807654.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.1446227633277993, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.03787960729096085, "learning_rate": 1.4433179723502306e-05, "loss": 0.0015, "num_tokens": 6815053.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.1448072311381664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.029287808923982084, "learning_rate": 1.4451612903225806e-05, "loss": 0.0012, "num_tokens": 6825862.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.14499169894853348, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.02604480367153883, "learning_rate": 1.447004608294931e-05, "loss": 0.001, "num_tokens": 6833291.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.14517616675890058, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.029392108554020524, "learning_rate": 1.4488479262672812e-05, "loss": 0.0012, "num_tokens": 6839866.0, "reward": 1.1339285373687744, "reward_std": 0.8333758115768433, "rewards/fixed_code_pass_all_test_reward/mean": 0.3839285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.510726809501648, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.14536063456926765, "frac_reward_zero_std": 1.0, "grad_norm": 15.5625, "kl": 0.7103681610897183, "learning_rate": 1.4506912442396314e-05, "loss": 0.0284, "num_tokens": 6843656.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 664.375, "completions/mean_terminated_length": 664.375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.14554510237963475, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "kl": 0.01622362481430173, "learning_rate": 1.4525345622119817e-05, "loss": 0.0006, "num_tokens": 6854675.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.14572957019000185, "frac_reward_zero_std": 1.0, "grad_norm": 0.2451171875, "kl": 0.018583374738227576, "learning_rate": 1.454377880184332e-05, "loss": 0.0007, "num_tokens": 6861820.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.14591403800036892, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.03317134128883481, "learning_rate": 1.456221198156682e-05, "loss": 0.0013, "num_tokens": 6865850.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 570.125, "completions/mean_terminated_length": 359.0000305175781, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.14609850581073602, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.015854478522669524, "learning_rate": 1.4580645161290324e-05, "loss": 0.0006, "num_tokens": 6877371.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.14628297362110312, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.016353400889784098, "learning_rate": 1.4599078341013827e-05, "loss": 0.0007, "num_tokens": 6884726.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1464674414314702, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.014562718919478357, "learning_rate": 1.4617511520737328e-05, "loss": 0.0006, "num_tokens": 6890822.0, "reward": 1.75, "reward_std": 0.3505098223686218, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1466519092418373, "frac_reward_zero_std": 1.0, "grad_norm": 0.185546875, "kl": 0.0419701935024932, "learning_rate": 1.4635944700460832e-05, "loss": 0.0017, "num_tokens": 6894897.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.1468363770522044, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.023721805890090764, "learning_rate": 1.4654377880184335e-05, "loss": 0.0009, "num_tokens": 6901224.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.14702084486257147, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.014760594698600471, "learning_rate": 1.4672811059907835e-05, "loss": 0.0006, "num_tokens": 6909312.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.14720531267293857, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.025285649637226015, "learning_rate": 1.4691244239631338e-05, "loss": 0.001, "num_tokens": 6916214.0, "reward": 1.4318182468414307, "reward_std": 0.2780858874320984, "rewards/fixed_code_pass_all_test_reward/mean": 0.4318181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.2780858874320984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.14738978048330567, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.01802999951178208, "learning_rate": 1.470967741935484e-05, "loss": 0.0007, "num_tokens": 6920849.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.14757424829367274, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.020955910498742014, "learning_rate": 1.4728110599078343e-05, "loss": 0.0008, "num_tokens": 6929161.0, "reward": 1.4772727489471436, "reward_std": 0.2945791482925415, "rewards/fixed_code_pass_all_test_reward/mean": 0.47727271914482117, "rewards/fixed_code_pass_all_test_reward/std": 0.2945791482925415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.14775871610403984, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.02193617168813944, "learning_rate": 1.4746543778801846e-05, "loss": 0.0009, "num_tokens": 6933296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.14794318391440694, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.03128149639815092, "learning_rate": 1.4764976958525347e-05, "loss": 0.0013, "num_tokens": 6944145.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 347.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.14812765172477402, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.015381735400296748, "learning_rate": 1.4783410138248849e-05, "loss": 0.0006, "num_tokens": 6950793.0, "reward": 1.759615421295166, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8846153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 390.5, "completions/mean_terminated_length": 390.5, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.14831211953514112, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.015157002257183194, "learning_rate": 1.480184331797235e-05, "loss": 0.0006, "num_tokens": 6958445.0, "reward": 1.4285714626312256, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.14849658734550822, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.023545119212940335, "learning_rate": 1.4820276497695854e-05, "loss": 0.0009, "num_tokens": 6966449.0, "reward": 1.0272727012634277, "reward_std": 0.04007076844573021, "rewards/fixed_code_pass_all_test_reward/mean": 0.027272727340459824, "rewards/fixed_code_pass_all_test_reward/std": 0.040070775896310806, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1486810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.01722046302165836, "learning_rate": 1.4838709677419357e-05, "loss": 0.0007, "num_tokens": 6970979.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 448.875, "completions/mean_terminated_length": 448.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.1488655229662424, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03055403829785064, "learning_rate": 1.4857142857142858e-05, "loss": 0.0012, "num_tokens": 6979674.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 376.375, "completions/mean_terminated_length": 376.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.1490499907766095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02564426069147885, "learning_rate": 1.4875576036866362e-05, "loss": 0.001, "num_tokens": 6989245.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.14923445858697656, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.02073127235053107, "learning_rate": 1.4894009216589861e-05, "loss": 0.0008, "num_tokens": 6998452.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.14941892639734367, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.03303106583189219, "learning_rate": 1.4912442396313365e-05, "loss": 0.0013, "num_tokens": 7005352.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 433.75, "completions/mean_terminated_length": 433.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.14960339420771077, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564453125, "kl": 0.008066351554589346, "learning_rate": 1.4930875576036868e-05, "loss": 0.0003, "num_tokens": 7013254.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.14978786201807784, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.011897749034687877, "learning_rate": 1.494930875576037e-05, "loss": 0.0005, "num_tokens": 7018321.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.14997232982844494, "frac_reward_zero_std": 1.0, "grad_norm": 0.50390625, "kl": 0.037915762164629996, "learning_rate": 1.4967741935483873e-05, "loss": 0.0015, "num_tokens": 7027078.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 487.75, "completions/mean_terminated_length": 487.75, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.15015679763881204, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.014756782446056604, "learning_rate": 1.4986175115207376e-05, "loss": 0.0006, "num_tokens": 7035412.0, "reward": 1.6195652484893799, "reward_std": 0.3381814956665039, "rewards/fixed_code_pass_all_test_reward/mean": 0.7445651888847351, "rewards/fixed_code_pass_all_test_reward/std": 0.11251069605350494, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1503412654491791, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.016000698611605912, "learning_rate": 1.5004608294930876e-05, "loss": 0.0006, "num_tokens": 7041478.0, "reward": 1.5535714626312256, "reward_std": 0.4855791926383972, "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 372.375, "completions/mean_terminated_length": 372.375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.1505257332595462, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.015778279514051974, "learning_rate": 1.5023041474654379e-05, "loss": 0.0006, "num_tokens": 7049233.0, "reward": 1.144736886024475, "reward_std": 0.3465586304664612, "rewards/fixed_code_pass_all_test_reward/mean": 0.14473684132099152, "rewards/fixed_code_pass_all_test_reward/std": 0.34655866026878357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1507102010699133, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.021435981965623796, "learning_rate": 1.504147465437788e-05, "loss": 0.0009, "num_tokens": 7058541.0, "reward": 1.5487804412841797, "reward_std": 0.4442201256752014, "rewards/fixed_code_pass_all_test_reward/mean": 0.5487804412841797, "rewards/fixed_code_pass_all_test_reward/std": 0.4442201256752014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.15089466888028039, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.01879781624302268, "learning_rate": 1.5059907834101384e-05, "loss": 0.0008, "num_tokens": 7064475.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.1510791366906475, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.015623708255589008, "learning_rate": 1.5078341013824887e-05, "loss": 0.0006, "num_tokens": 7072236.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.15126360450101456, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.01770372991450131, "learning_rate": 1.5096774193548389e-05, "loss": 0.0007, "num_tokens": 7080564.0, "reward": 1.84375, "reward_std": 0.34092646837234497, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.34092649817466736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.15144807231138166, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03773631388321519, "learning_rate": 1.511520737327189e-05, "loss": 0.0015, "num_tokens": 7088499.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 599.25, "completions/mean_terminated_length": 599.25, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.15163254012174876, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.011290738882962614, "learning_rate": 1.5133640552995392e-05, "loss": 0.0005, "num_tokens": 7099669.0, "reward": 1.8020833730697632, "reward_std": 0.3240906596183777, "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.15181700793211583, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.021938687015790492, "learning_rate": 1.5152073732718895e-05, "loss": 0.0009, "num_tokens": 7106585.0, "reward": 1.7592592239379883, "reward_std": 0.24084247648715973, "rewards/fixed_code_pass_all_test_reward/mean": 0.7592592239379883, "rewards/fixed_code_pass_all_test_reward/std": 0.24084246158599854, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 520.125, "completions/mean_terminated_length": 520.125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.15200147574248293, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.0087799089087639, "learning_rate": 1.5170506912442398e-05, "loss": 0.0004, "num_tokens": 7118442.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.15218594355285003, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.02226733387215063, "learning_rate": 1.51889400921659e-05, "loss": 0.0009, "num_tokens": 7124268.0, "reward": 1.0860215425491333, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 423.75, "completions/mean_terminated_length": 423.75, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.1523704113632171, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.023682945990003645, "learning_rate": 1.5207373271889403e-05, "loss": 0.0009, "num_tokens": 7136738.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 530.875, "completions/mean_terminated_length": 530.875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.1525548791735842, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.011575250828173012, "learning_rate": 1.5225806451612903e-05, "loss": 0.0005, "num_tokens": 7150273.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1527393469839513, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.03281094157136977, "learning_rate": 1.5244239631336406e-05, "loss": 0.0013, "num_tokens": 7154920.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 538.0, "completions/mean_terminated_length": 538.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.15292381479431838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.023295892227906734, "learning_rate": 1.5262672811059907e-05, "loss": 0.0009, "num_tokens": 7166760.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.15310828260468548, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.020283473888412118, "learning_rate": 1.5281105990783412e-05, "loss": 0.0008, "num_tokens": 7175665.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 540.625, "completions/mean_terminated_length": 325.2857360839844, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.15329275041505258, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.029170521316700615, "learning_rate": 1.5299539170506914e-05, "loss": 0.0012, "num_tokens": 7187518.0, "reward": 1.0625, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.15347721822541965, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.022016245580744, "learning_rate": 1.5317972350230415e-05, "loss": 0.0009, "num_tokens": 7195043.0, "reward": 1.8333333730697632, "reward_std": 0.30860665440559387, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.15366168603578675, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.05257665412500501, "learning_rate": 1.533640552995392e-05, "loss": 0.0021, "num_tokens": 7203385.0, "reward": 1.933823585510254, "reward_std": 0.18717533349990845, "rewards/fixed_code_pass_all_test_reward/mean": 0.9338235259056091, "rewards/fixed_code_pass_all_test_reward/std": 0.18717533349990845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 523.5, "completions/mean_terminated_length": 523.5, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.15384615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.015067261992953718, "learning_rate": 1.535483870967742e-05, "loss": 0.0006, "num_tokens": 7218389.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.15403062165652093, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.01656091643963009, "learning_rate": 1.5373271889400923e-05, "loss": 0.0007, "num_tokens": 7222752.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.15421508946688803, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.023286007111892104, "learning_rate": 1.5391705069124425e-05, "loss": 0.0009, "num_tokens": 7233213.0, "reward": 1.3842593431472778, "reward_std": 0.24879682064056396, "rewards/fixed_code_pass_all_test_reward/mean": 0.38425928354263306, "rewards/fixed_code_pass_all_test_reward/std": 0.24879683554172516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.15439955727725513, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.02792111097369343, "learning_rate": 1.5410138248847926e-05, "loss": 0.0011, "num_tokens": 7257107.0, "reward": 1.4811747074127197, "reward_std": 0.3134256899356842, "rewards/fixed_code_pass_all_test_reward/mean": 0.4811747074127197, "rewards/fixed_code_pass_all_test_reward/std": 0.3134257197380066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 104.0, "completions/mean_terminated_length": 104.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.1545840250876222, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.024813206517137587, "learning_rate": 1.542857142857143e-05, "loss": 0.001, "num_tokens": 7260819.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1547684928979893, "frac_reward_zero_std": 1.0, "grad_norm": 0.326171875, "kl": 0.030742369475774467, "learning_rate": 1.5447004608294933e-05, "loss": 0.0012, "num_tokens": 7269078.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 356.0, "completions/mean_terminated_length": 356.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.1549529607083564, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.023281393514480442, "learning_rate": 1.5465437788018434e-05, "loss": 0.0009, "num_tokens": 7279718.0, "reward": 1.7840908765792847, "reward_std": 0.15923000872135162, "rewards/fixed_code_pass_all_test_reward/mean": 0.7840908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.15923000872135162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.15513742851872347, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.020685943658463657, "learning_rate": 1.5483870967741936e-05, "loss": 0.0008, "num_tokens": 7285747.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.15532189632909058, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.03081200725864619, "learning_rate": 1.5502304147465438e-05, "loss": 0.0012, "num_tokens": 7292810.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 743.5, "completions/mean_terminated_length": 308.66668701171875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.15550636413945768, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.019801245478447527, "learning_rate": 1.5520737327188942e-05, "loss": 0.0008, "num_tokens": 7304574.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.15569083194982475, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.03433478041552007, "learning_rate": 1.5539170506912444e-05, "loss": 0.0014, "num_tokens": 7312604.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.15587529976019185, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.04146273201331496, "learning_rate": 1.5557603686635946e-05, "loss": 0.0017, "num_tokens": 7317076.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.15605976757055895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.014800441393163055, "learning_rate": 1.5576036866359447e-05, "loss": 0.0006, "num_tokens": 7322162.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.15624423538092602, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.06402567820623517, "learning_rate": 1.559447004608295e-05, "loss": 0.0026, "num_tokens": 7330336.0, "reward": 1.8897058963775635, "reward_std": 0.31195884943008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15642870319129312, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.028404308948665857, "learning_rate": 1.5612903225806454e-05, "loss": 0.0011, "num_tokens": 7340053.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.15661317100166022, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.029230238869786263, "learning_rate": 1.5631336405529955e-05, "loss": 0.0012, "num_tokens": 7345777.0, "reward": 1.0446429252624512, "reward_std": 0.12626908719539642, "rewards/fixed_code_pass_all_test_reward/mean": 0.0446428582072258, "rewards/fixed_code_pass_all_test_reward/std": 0.12626907229423523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.1567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.030965921003371477, "learning_rate": 1.5649769585253457e-05, "loss": 0.0012, "num_tokens": 7357582.0, "reward": 1.3181817531585693, "reward_std": 0.12856486439704895, "rewards/fixed_code_pass_all_test_reward/mean": 0.3181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.12856487929821014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.1569821066223944, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.014108254690654576, "learning_rate": 1.566820276497696e-05, "loss": 0.0006, "num_tokens": 7364544.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 343.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1571665744327615, "frac_reward_zero_std": 1.0, "grad_norm": 1.6171875, "kl": 0.025126139109488577, "learning_rate": 1.568663594470046e-05, "loss": 0.001, "num_tokens": 7371721.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 368.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.15735104224312857, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.024267777684144676, "learning_rate": 1.5705069124423965e-05, "loss": 0.001, "num_tokens": 7383245.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 403.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.15753551005349567, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.0269777748035267, "learning_rate": 1.5723502304147466e-05, "loss": 0.0011, "num_tokens": 7392545.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.15771997786386274, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.02054202405270189, "learning_rate": 1.5741935483870968e-05, "loss": 0.0008, "num_tokens": 7401379.0, "reward": 1.3658536672592163, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3658536672592163, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.15790444567422984, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.027560033951885998, "learning_rate": 1.5760368663594473e-05, "loss": 0.0011, "num_tokens": 7410230.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.15808891348459694, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.028376445872709155, "learning_rate": 1.5778801843317974e-05, "loss": 0.0011, "num_tokens": 7416088.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.15827338129496402, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.02747140621067956, "learning_rate": 1.5797235023041476e-05, "loss": 0.0011, "num_tokens": 7425726.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.15845784910533112, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.019504684838466346, "learning_rate": 1.5815668202764977e-05, "loss": 0.0008, "num_tokens": 7434059.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 564.25, "completions/mean_terminated_length": 564.25, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.15864231691569822, "frac_reward_zero_std": 1.0, "grad_norm": 0.055908203125, "kl": 0.01576250314246863, "learning_rate": 1.5834101382488482e-05, "loss": 0.0006, "num_tokens": 7444317.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1588267847260653, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03001510677859187, "learning_rate": 1.5852534562211984e-05, "loss": 0.0012, "num_tokens": 7452921.0, "reward": 1.471982717514038, "reward_std": 0.5692220330238342, "rewards/fixed_code_pass_all_test_reward/mean": 0.7219827771186829, "rewards/fixed_code_pass_all_test_reward/std": 0.4522782266139984, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1590112525364324, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.0391040020622313, "learning_rate": 1.5870967741935485e-05, "loss": 0.0016, "num_tokens": 7461559.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1591957203467995, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.02142679668031633, "learning_rate": 1.588940092165899e-05, "loss": 0.0009, "num_tokens": 7466756.0, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 361.875, "completions/mean_terminated_length": 361.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.15938018815716656, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.019700519274920225, "learning_rate": 1.5907834101382488e-05, "loss": 0.0008, "num_tokens": 7477195.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.15956465596753366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.021246141870506108, "learning_rate": 1.5926267281105993e-05, "loss": 0.0008, "num_tokens": 7486887.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.15974912377790076, "frac_reward_zero_std": 1.0, "grad_norm": 0.259765625, "kl": 0.042252736166119576, "learning_rate": 1.5944700460829495e-05, "loss": 0.0017, "num_tokens": 7494641.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 477.25, "completions/mean_terminated_length": 477.25, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.15993359158826784, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.01356708921957761, "learning_rate": 1.5963133640552996e-05, "loss": 0.0005, "num_tokens": 7505811.0, "reward": 1.138157844543457, "reward_std": 0.055824220180511475, "rewards/fixed_code_pass_all_test_reward/mean": 0.1381578892469406, "rewards/fixed_code_pass_all_test_reward/std": 0.055824216455221176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 388.625, "completions/mean_terminated_length": 388.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.16011805939863494, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.02838135918136686, "learning_rate": 1.59815668202765e-05, "loss": 0.0011, "num_tokens": 7512560.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.16030252720900204, "frac_reward_zero_std": 1.0, "grad_norm": 0.2275390625, "kl": 0.045360136311501265, "learning_rate": 1.6000000000000003e-05, "loss": 0.0018, "num_tokens": 7519905.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1604869950193691, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.06809098657686263, "learning_rate": 1.6018433179723504e-05, "loss": 0.0027, "num_tokens": 7526606.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1606714628297362, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.0310385919874534, "learning_rate": 1.6036866359447006e-05, "loss": 0.0012, "num_tokens": 7533490.0, "reward": 1.7872340679168701, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7872340679168701, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 433.125, "completions/mean_terminated_length": 202.42857360839844, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1608559306401033, "frac_reward_zero_std": 0.0, "grad_norm": 0.56640625, "kl": 0.015134699031477794, "learning_rate": 1.6055299539170507e-05, "loss": 0.0006, "num_tokens": 7542339.0, "reward": 1.6111111640930176, "reward_std": 0.6542045474052429, "rewards/fixed_code_pass_all_test_reward/mean": 0.7361111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.30441105365753174, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.16104039845047038, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.017901247541885823, "learning_rate": 1.6073732718894012e-05, "loss": 0.0007, "num_tokens": 7550871.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 429.875, "completions/mean_terminated_length": 429.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.16122486626083748, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.01244936982402578, "learning_rate": 1.6092165898617514e-05, "loss": 0.0005, "num_tokens": 7560318.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.16140933407120459, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.026560049154795706, "learning_rate": 1.6110599078341015e-05, "loss": 0.0011, "num_tokens": 7567487.0, "reward": 1.0403225421905518, "reward_std": 0.033390238881111145, "rewards/fixed_code_pass_all_test_reward/mean": 0.04032257944345474, "rewards/fixed_code_pass_all_test_reward/std": 0.03339026868343353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 529.125, "completions/mean_terminated_length": 529.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.16159380188157166, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.015643664402887225, "learning_rate": 1.6129032258064517e-05, "loss": 0.0006, "num_tokens": 7579256.0, "reward": 1.8660714626312256, "reward_std": 0.09518492966890335, "rewards/fixed_code_pass_all_test_reward/mean": 0.8660714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.09518493711948395, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.16177826969193876, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01870645908638835, "learning_rate": 1.614746543778802e-05, "loss": 0.0007, "num_tokens": 7585903.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.16196273750230586, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.04789177072234452, "learning_rate": 1.6165898617511523e-05, "loss": 0.0019, "num_tokens": 7596090.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.16214720531267293, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.023532550549134612, "learning_rate": 1.6184331797235025e-05, "loss": 0.0009, "num_tokens": 7603086.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.16233167312304003, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.05211366293951869, "learning_rate": 1.6202764976958526e-05, "loss": 0.0021, "num_tokens": 7612193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.16251614093340713, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.04060445830691606, "learning_rate": 1.622119815668203e-05, "loss": 0.0016, "num_tokens": 7619060.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1627006087437742, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.03118653572164476, "learning_rate": 1.623963133640553e-05, "loss": 0.0012, "num_tokens": 7625497.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1628850765541413, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.007208866882137954, "learning_rate": 1.6258064516129034e-05, "loss": 0.0003, "num_tokens": 7631334.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 118.25, "completions/mean_terminated_length": 118.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1630695443645084, "frac_reward_zero_std": 0.0, "grad_norm": 3.078125, "kl": 0.025451550958678126, "learning_rate": 1.6276497695852536e-05, "loss": 0.001, "num_tokens": 7635096.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 447.125, "completions/mean_terminated_length": 218.42857360839844, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.16325401217487548, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.02505898952949792, "learning_rate": 1.6294930875576037e-05, "loss": 0.001, "num_tokens": 7643441.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.16343847998524258, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.019641457591205835, "learning_rate": 1.6313364055299542e-05, "loss": 0.0008, "num_tokens": 7650422.0, "reward": 1.51630437374115, "reward_std": 0.4275406301021576, "rewards/fixed_code_pass_all_test_reward/mean": 0.5163043141365051, "rewards/fixed_code_pass_all_test_reward/std": 0.4275406002998352, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 380.25, "completions/mean_terminated_length": 380.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.16362294779560965, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.01911249232944101, "learning_rate": 1.6331797235023044e-05, "loss": 0.0008, "num_tokens": 7661544.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.16380741560597675, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.026515733799897134, "learning_rate": 1.6350230414746545e-05, "loss": 0.0011, "num_tokens": 7672989.0, "reward": 1.7777776718139648, "reward_std": 0.31426966190338135, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.16399188341634385, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.02514504559803754, "learning_rate": 1.6368663594470047e-05, "loss": 0.001, "num_tokens": 7682652.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.16417635122671093, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.015608435205649585, "learning_rate": 1.638709677419355e-05, "loss": 0.0006, "num_tokens": 7693236.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.16436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.017180391238071024, "learning_rate": 1.6405529953917053e-05, "loss": 0.0007, "num_tokens": 7705065.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 498.125, "completions/mean_terminated_length": 498.125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.16454528684744513, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.0068995543697383255, "learning_rate": 1.6423963133640555e-05, "loss": 0.0003, "num_tokens": 7714090.0, "reward": 1.971153736114502, "reward_std": 0.03981149569153786, "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.039811473339796066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1647297546578122, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.022971801343373954, "learning_rate": 1.6442396313364056e-05, "loss": 0.0009, "num_tokens": 7721530.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.1649142224681793, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.01748776837484911, "learning_rate": 1.6460829493087558e-05, "loss": 0.0007, "num_tokens": 7728403.0, "reward": 1.3499999046325684, "reward_std": 0.1414213478565216, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.1650986902785464, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.021252246340736747, "learning_rate": 1.647926267281106e-05, "loss": 0.0009, "num_tokens": 7736593.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 605.375, "completions/mean_terminated_length": 605.375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.16528315808891347, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.010175477509619668, "learning_rate": 1.6497695852534564e-05, "loss": 0.0004, "num_tokens": 7752908.0, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.16546762589928057, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03583967941813171, "learning_rate": 1.6516129032258066e-05, "loss": 0.0014, "num_tokens": 7760897.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 417.375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.16565209370964767, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.014887030585668981, "learning_rate": 1.6534562211981567e-05, "loss": 0.0006, "num_tokens": 7769916.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 440.5, "completions/mean_terminated_length": 440.5, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.16583656152001475, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.02132390032056719, "learning_rate": 1.6552995391705072e-05, "loss": 0.0009, "num_tokens": 7779472.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 645.25, "completions/mean_terminated_length": 645.25, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.16602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.012361459608655423, "learning_rate": 1.6571428571428574e-05, "loss": 0.0005, "num_tokens": 7795722.0, "reward": 1.1141974925994873, "reward_std": 0.09465660154819489, "rewards/fixed_code_pass_all_test_reward/mean": 0.11419752240180969, "rewards/fixed_code_pass_all_test_reward/std": 0.09465659409761429, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16620549714074895, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.015072114358190447, "learning_rate": 1.6589861751152075e-05, "loss": 0.0006, "num_tokens": 7801112.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.16638996495111602, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.018616642453707755, "learning_rate": 1.6608294930875577e-05, "loss": 0.0007, "num_tokens": 7805650.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.16657443276148312, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.0367014838848263, "learning_rate": 1.662672811059908e-05, "loss": 0.0015, "num_tokens": 7811910.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.16675890057185022, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.040688263485208154, "learning_rate": 1.6645161290322583e-05, "loss": 0.0016, "num_tokens": 7821812.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 449.875, "completions/mean_terminated_length": 449.875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.1669433683822173, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.01776444015558809, "learning_rate": 1.6663594470046085e-05, "loss": 0.0007, "num_tokens": 7831315.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1671278361925844, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.021341740270145237, "learning_rate": 1.6682027649769587e-05, "loss": 0.0009, "num_tokens": 7838448.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1673123040029515, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.043569871108047664, "learning_rate": 1.6700460829493088e-05, "loss": 0.0017, "num_tokens": 7846448.0, "reward": 1.7364864349365234, "reward_std": 0.36368322372436523, "rewards/fixed_code_pass_all_test_reward/mean": 0.7364864349365234, "rewards/fixed_code_pass_all_test_reward/std": 0.36368322372436523, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 519.125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.16749677181331857, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.009683635624242015, "learning_rate": 1.671889400921659e-05, "loss": 0.0004, "num_tokens": 7855945.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 473.5, "completions/mean_terminated_length": 248.57144165039062, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.16768123962368567, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.023402717532007955, "learning_rate": 1.6737327188940095e-05, "loss": 0.0009, "num_tokens": 7864005.0, "reward": 0.84375, "reward_std": 0.5240969061851501, "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, "rewards/fixed_code_pass_all_test_reward/std": 0.08258593827486038, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.16786570743405277, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.03412551339715719, "learning_rate": 1.6755760368663596e-05, "loss": 0.0014, "num_tokens": 7868161.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.16805017524441984, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04721176717430353, "learning_rate": 1.6774193548387098e-05, "loss": 0.0019, "num_tokens": 7875724.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.16823464305478694, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.017231334350071847, "learning_rate": 1.6792626728110603e-05, "loss": 0.0007, "num_tokens": 7884198.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.16841911086515404, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.04926277091726661, "learning_rate": 1.68110599078341e-05, "loss": 0.002, "num_tokens": 7892316.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16860357867552112, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.0403216271661222, "learning_rate": 1.6829493087557606e-05, "loss": 0.0016, "num_tokens": 7901029.0, "reward": 1.7999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.16878804648588822, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.025000788504257798, "learning_rate": 1.6847926267281107e-05, "loss": 0.001, "num_tokens": 7908637.0, "reward": 1.4700000286102295, "reward_std": 0.3870769441127777, "rewards/fixed_code_pass_all_test_reward/mean": 0.4700000286102295, "rewards/fixed_code_pass_all_test_reward/std": 0.3870770037174225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.16897251429625532, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.016198983183130622, "learning_rate": 1.686635944700461e-05, "loss": 0.0006, "num_tokens": 7913836.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1691569821066224, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.03361594758462161, "learning_rate": 1.6884792626728114e-05, "loss": 0.0013, "num_tokens": 7923414.0, "reward": 1.3068182468414307, "reward_std": 0.3070886433124542, "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.3070886433124542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1693414499169895, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.027360130450688303, "learning_rate": 1.6903225806451615e-05, "loss": 0.0011, "num_tokens": 7930899.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.1695259177273566, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.013544372864998877, "learning_rate": 1.6921658986175117e-05, "loss": 0.0005, "num_tokens": 7936215.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1493.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 960.75, "completions/mean_terminated_length": 960.75, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.16971038553772366, "frac_reward_zero_std": 0.0, "grad_norm": 0.42578125, "kl": 0.008179209427908063, "learning_rate": 1.6940092165898618e-05, "loss": 0.0003, "num_tokens": 7956581.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.16989485334809076, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.04175469046458602, "learning_rate": 1.695852534562212e-05, "loss": 0.0017, "num_tokens": 7965207.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.17007932115845784, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.02670258254511282, "learning_rate": 1.6976958525345625e-05, "loss": 0.0011, "num_tokens": 7974786.0, "reward": 1.7025001049041748, "reward_std": 0.3619687855243683, "rewards/fixed_code_pass_all_test_reward/mean": 0.7024999856948853, "rewards/fixed_code_pass_all_test_reward/std": 0.36196884512901306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.17026378896882494, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.017540586995892227, "learning_rate": 1.6995391705069126e-05, "loss": 0.0007, "num_tokens": 7979670.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.17044825677919204, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.021959692589007318, "learning_rate": 1.7013824884792628e-05, "loss": 0.0009, "num_tokens": 7985315.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.1706327245895591, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0193521564360708, "learning_rate": 1.703225806451613e-05, "loss": 0.0008, "num_tokens": 7993415.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 420.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1708171923999262, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.03135782788740471, "learning_rate": 1.705069124423963e-05, "loss": 0.0013, "num_tokens": 8000829.0, "reward": 1.6378676891326904, "reward_std": 0.33580029010772705, "rewards/fixed_code_pass_all_test_reward/mean": 0.7628676891326904, "rewards/fixed_code_pass_all_test_reward/std": 0.1857566386461258, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.1710016602102933, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.02334133314434439, "learning_rate": 1.7069124423963136e-05, "loss": 0.0009, "num_tokens": 8009727.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.17118612802066038, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.03564389212988317, "learning_rate": 1.7087557603686637e-05, "loss": 0.0014, "num_tokens": 8019762.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 426.875, "completions/mean_terminated_length": 426.875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.17137059583102748, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03187166718998924, "learning_rate": 1.710599078341014e-05, "loss": 0.0013, "num_tokens": 8027985.0, "reward": 1.25, "reward_std": 0.13363061845302582, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.17155506364139458, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.023763384553603828, "learning_rate": 1.7124423963133644e-05, "loss": 0.001, "num_tokens": 8038664.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.17173953145176166, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02721288026077673, "learning_rate": 1.7142857142857142e-05, "loss": 0.0011, "num_tokens": 8046673.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 421.5, "completions/mean_terminated_length": 189.1428680419922, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.17192399926212876, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.018199846977950074, "learning_rate": 1.7161290322580647e-05, "loss": 0.0007, "num_tokens": 8052885.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.17210846707249586, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.016387610288802534, "learning_rate": 1.7179723502304148e-05, "loss": 0.0007, "num_tokens": 8059877.0, "reward": 1.404411792755127, "reward_std": 0.10218808799982071, "rewards/fixed_code_pass_all_test_reward/mean": 0.40441176295280457, "rewards/fixed_code_pass_all_test_reward/std": 0.1021881252527237, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 514.5, "completions/mean_terminated_length": 514.5, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.17229293488286293, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.0280854522716254, "learning_rate": 1.719815668202765e-05, "loss": 0.0011, "num_tokens": 8073113.0, "reward": 1.6428570747375488, "reward_std": 0.4948716461658478, "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.49487167596817017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.17247740269323003, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.02991078153718263, "learning_rate": 1.7216589861751155e-05, "loss": 0.0012, "num_tokens": 8081894.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 313.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.17266187050359713, "frac_reward_zero_std": 1.0, "grad_norm": 0.2001953125, "kl": 0.045928434701636434, "learning_rate": 1.7235023041474656e-05, "loss": 0.0018, "num_tokens": 8091401.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1728463383139642, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.03130838752258569, "learning_rate": 1.7253456221198158e-05, "loss": 0.0013, "num_tokens": 8101128.0, "reward": 1.2804054021835327, "reward_std": 0.05393325537443161, "rewards/fixed_code_pass_all_test_reward/mean": 0.2804054021835327, "rewards/fixed_code_pass_all_test_reward/std": 0.05393326282501221, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 800.75, "completions/mean_terminated_length": 800.75, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.1730308061243313, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.008159483026247472, "learning_rate": 1.727188940092166e-05, "loss": 0.0003, "num_tokens": 8116486.0, "reward": 1.5119047164916992, "reward_std": 0.23363162577152252, "rewards/fixed_code_pass_all_test_reward/mean": 0.511904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.23363162577152252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 378.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1732152739346984, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.01692582934629172, "learning_rate": 1.729032258064516e-05, "loss": 0.0007, "num_tokens": 8126068.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 499.875, "completions/mean_terminated_length": 499.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.17339974174506548, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.02368154004216194, "learning_rate": 1.7308755760368666e-05, "loss": 0.0009, "num_tokens": 8137859.0, "reward": 1.5297619104385376, "reward_std": 0.7361450791358948, "rewards/fixed_code_pass_all_test_reward/mean": 0.6547619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.47941088676452637, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.17358420955543258, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.029777121846564114, "learning_rate": 1.7327188940092167e-05, "loss": 0.0012, "num_tokens": 8142756.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.17376867736579968, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.020690912671852857, "learning_rate": 1.734562211981567e-05, "loss": 0.0008, "num_tokens": 8154976.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 357.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.17395314517616675, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.028870258829556406, "learning_rate": 1.736405529953917e-05, "loss": 0.0012, "num_tokens": 8162413.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.17413761298653385, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.029334601247683167, "learning_rate": 1.7382488479262672e-05, "loss": 0.0012, "num_tokens": 8172324.0, "reward": 1.68478262424469, "reward_std": 0.31759119033813477, "rewards/fixed_code_pass_all_test_reward/mean": 0.6847826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.31759122014045715, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 405.0, "completions/mean_terminated_length": 405.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.17432208079690095, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.01651646476238966, "learning_rate": 1.7400921658986177e-05, "loss": 0.0007, "num_tokens": 8181116.0, "reward": 1.0099999904632568, "reward_std": 0.028284268453717232, "rewards/fixed_code_pass_all_test_reward/mean": 0.009999999776482582, "rewards/fixed_code_pass_all_test_reward/std": 0.02828427031636238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 652.625, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.17450654860726802, "frac_reward_zero_std": 0.0, "grad_norm": 0.53515625, "kl": 0.0072012043674476445, "learning_rate": 1.741935483870968e-05, "loss": 0.0003, "num_tokens": 8190713.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.17469101641763513, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.02335134509485215, "learning_rate": 1.743778801843318e-05, "loss": 0.0009, "num_tokens": 8195569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.17487548422800223, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.01473669579718262, "learning_rate": 1.7456221198156685e-05, "loss": 0.0006, "num_tokens": 8207597.0, "reward": 1.451388955116272, "reward_std": 0.5865839123725891, "rewards/fixed_code_pass_all_test_reward/mean": 0.5763888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.23323412239551544, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1750599520383693, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.03596255858428776, "learning_rate": 1.7474654377880186e-05, "loss": 0.0014, "num_tokens": 8212958.0, "reward": 1.3392858505249023, "reward_std": 0.32448652386665344, "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3244866132736206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1752444198487364, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02906238113064319, "learning_rate": 1.7493087557603688e-05, "loss": 0.0012, "num_tokens": 8219676.0, "reward": 1.5185184478759766, "reward_std": 0.40668532252311707, "rewards/fixed_code_pass_all_test_reward/mean": 0.5185185670852661, "rewards/fixed_code_pass_all_test_reward/std": 0.4066852927207947, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1754288876591035, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.03594231209717691, "learning_rate": 1.751152073732719e-05, "loss": 0.0014, "num_tokens": 8226247.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.17561335546947057, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0609193128766492, "learning_rate": 1.7529953917050694e-05, "loss": 0.0024, "num_tokens": 8235454.0, "reward": 1.7635540962219238, "reward_std": 0.046856433153152466, "rewards/fixed_code_pass_all_test_reward/mean": 0.7635542154312134, "rewards/fixed_code_pass_all_test_reward/std": 0.046856485307216644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.17579782327983767, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.045941002666950226, "learning_rate": 1.7548387096774196e-05, "loss": 0.0018, "num_tokens": 8241254.0, "reward": 1.642045497894287, "reward_std": 0.20535555481910706, "rewards/fixed_code_pass_all_test_reward/mean": 0.6420454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.20535553991794586, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.17598229109020475, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.08620562660507858, "learning_rate": 1.7566820276497697e-05, "loss": 0.0034, "num_tokens": 8244915.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.17616675890057185, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.022723793517798185, "learning_rate": 1.75852534562212e-05, "loss": 0.0009, "num_tokens": 8254203.0, "reward": 1.7272727489471436, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.17635122671093895, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.020114279235713184, "learning_rate": 1.76036866359447e-05, "loss": 0.0008, "num_tokens": 8258561.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.17653569452130602, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.050007963087409735, "learning_rate": 1.7622119815668205e-05, "loss": 0.002, "num_tokens": 8265960.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.17672016233167312, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.049208341632038355, "learning_rate": 1.7640552995391707e-05, "loss": 0.002, "num_tokens": 8273975.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.17690463014204022, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.017374115181155503, "learning_rate": 1.765898617511521e-05, "loss": 0.0007, "num_tokens": 8281676.0, "reward": 1.390625, "reward_std": 0.2471940815448761, "rewards/fixed_code_pass_all_test_reward/mean": 0.390625, "rewards/fixed_code_pass_all_test_reward/std": 0.2471940815448761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.1770890979524073, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.02180948620662093, "learning_rate": 1.7677419354838713e-05, "loss": 0.0009, "num_tokens": 8286834.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1772735657627744, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.017014401382766664, "learning_rate": 1.7695852534562215e-05, "loss": 0.0007, "num_tokens": 8296205.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.1774580335731415, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.03613136557396501, "learning_rate": 1.7714285714285717e-05, "loss": 0.0014, "num_tokens": 8300599.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.17764250138350857, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.0538258976303041, "learning_rate": 1.7732718894009218e-05, "loss": 0.0022, "num_tokens": 8309220.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.17782696919387567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.03132734901737422, "learning_rate": 1.775115207373272e-05, "loss": 0.0013, "num_tokens": 8317369.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 361.75, "completions/mean_terminated_length": 361.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.17801143700424277, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.033077884931117296, "learning_rate": 1.7769585253456225e-05, "loss": 0.0013, "num_tokens": 8324839.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.17819590481460984, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.045198181411251426, "learning_rate": 1.7788018433179726e-05, "loss": 0.0018, "num_tokens": 8330933.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.17838037262497694, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.03309439099393785, "learning_rate": 1.7806451612903228e-05, "loss": 0.0013, "num_tokens": 8337457.0, "reward": 1.2635869979858398, "reward_std": 0.3847690224647522, "rewards/fixed_code_pass_all_test_reward/mean": 0.26358693838119507, "rewards/fixed_code_pass_all_test_reward/std": 0.3847690224647522, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 665.625, "completions/mean_terminated_length": 665.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.17856484043534404, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.029766413033939898, "learning_rate": 1.782488479262673e-05, "loss": 0.0012, "num_tokens": 8353790.0, "reward": 1.2000000476837158, "reward_std": 0.35456210374832153, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.3545621335506439, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.1787493082457111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.016769222274888307, "learning_rate": 1.784331797235023e-05, "loss": 0.0007, "num_tokens": 8360333.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 483.75, "completions/mean_terminated_length": 483.75, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.17893377605607821, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.02423131396062672, "learning_rate": 1.7861751152073736e-05, "loss": 0.001, "num_tokens": 8370371.0, "reward": 1.1071428060531616, "reward_std": 0.14787116646766663, "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.17911824386644531, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.031280478346161544, "learning_rate": 1.7880184331797237e-05, "loss": 0.0013, "num_tokens": 8377446.0, "reward": 1.3671875, "reward_std": 0.39058035612106323, "rewards/fixed_code_pass_all_test_reward/mean": 0.3671875, "rewards/fixed_code_pass_all_test_reward/std": 0.39058035612106323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 595.625, "completions/mean_terminated_length": 595.625, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.1793027116768124, "frac_reward_zero_std": 0.0, "grad_norm": 0.58984375, "kl": 0.010642128705512732, "learning_rate": 1.789861751152074e-05, "loss": 0.0004, "num_tokens": 8391331.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1794871794871795, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.031383720925077796, "learning_rate": 1.7917050691244244e-05, "loss": 0.0013, "num_tokens": 8396233.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1796716472975466, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.028886747430078685, "learning_rate": 1.7935483870967742e-05, "loss": 0.0012, "num_tokens": 8402630.0, "reward": 1.2347561120986938, "reward_std": 0.10093256831169128, "rewards/fixed_code_pass_all_test_reward/mean": 0.23475609719753265, "rewards/fixed_code_pass_all_test_reward/std": 0.10093259811401367, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.17985611510791366, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.04797433433122933, "learning_rate": 1.7953917050691247e-05, "loss": 0.0019, "num_tokens": 8408325.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.18004058291828076, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03495860635302961, "learning_rate": 1.7972350230414748e-05, "loss": 0.0014, "num_tokens": 8414248.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.18022505072864786, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.020019241026602685, "learning_rate": 1.799078341013825e-05, "loss": 0.0008, "num_tokens": 8423828.0, "reward": 1.5803570747375488, "reward_std": 0.09687156975269318, "rewards/fixed_code_pass_all_test_reward/mean": 0.5803571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.09687161445617676, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.18040951853901493, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.013790995057206601, "learning_rate": 1.8009216589861755e-05, "loss": 0.0006, "num_tokens": 8430670.0, "reward": 1.375, "reward_std": 0.3857583999633789, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.3857583999633789, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.18059398634938204, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.02370634861290455, "learning_rate": 1.8027649769585256e-05, "loss": 0.0009, "num_tokens": 8437844.0, "reward": 1.5892856121063232, "reward_std": 0.1415758579969406, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1415759027004242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.18077845415974914, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.023717242409475148, "learning_rate": 1.8046082949308758e-05, "loss": 0.0009, "num_tokens": 8441916.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.1809629219701162, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.038024751702323556, "learning_rate": 1.806451612903226e-05, "loss": 0.0015, "num_tokens": 8448047.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1811473897804833, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.052003409480676055, "learning_rate": 1.808294930875576e-05, "loss": 0.0021, "num_tokens": 8455935.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1813318575908504, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.048348663724027574, "learning_rate": 1.8101382488479266e-05, "loss": 0.0019, "num_tokens": 8462783.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.18151632540121748, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.039190820360090584, "learning_rate": 1.8119815668202767e-05, "loss": 0.0016, "num_tokens": 8467469.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 327.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.18170079321158458, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.14731870009563863, "learning_rate": 1.813824884792627e-05, "loss": 0.0059, "num_tokens": 8476341.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 409.125, "completions/mean_terminated_length": 409.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.18188526102195168, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03180694949696772, "learning_rate": 1.815668202764977e-05, "loss": 0.0013, "num_tokens": 8488214.0, "reward": 1.1315789222717285, "reward_std": 0.05626552551984787, "rewards/fixed_code_pass_all_test_reward/mean": 0.1315789520740509, "rewards/fixed_code_pass_all_test_reward/std": 0.05626552179455757, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.18206972883231876, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.02653311169706285, "learning_rate": 1.8175115207373272e-05, "loss": 0.0011, "num_tokens": 8495040.0, "reward": 1.942307710647583, "reward_std": 0.061675652861595154, "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.06167568638920784, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 423.25, "completions/mean_terminated_length": 423.25, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.18225419664268586, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.014761926722712815, "learning_rate": 1.8193548387096777e-05, "loss": 0.0006, "num_tokens": 8503658.0, "reward": 1.7797619104385376, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.18243866445305293, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.03590187546797097, "learning_rate": 1.8211981566820278e-05, "loss": 0.0014, "num_tokens": 8512146.0, "reward": 1.0833332538604736, "reward_std": 0.10212231427431107, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.10212230682373047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.18262313226342003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.01883467345032841, "learning_rate": 1.823041474654378e-05, "loss": 0.0008, "num_tokens": 8516755.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.18280760007378713, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.04174447455443442, "learning_rate": 1.8248847926267285e-05, "loss": 0.0017, "num_tokens": 8525993.0, "reward": 0.8125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1829920678841542, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.03661162592470646, "learning_rate": 1.8267281105990783e-05, "loss": 0.0015, "num_tokens": 8531852.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 888.375, "completions/mean_terminated_length": 888.375, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.1831765356945213, "frac_reward_zero_std": 0.0, "grad_norm": 0.54296875, "kl": 0.011283142201136798, "learning_rate": 1.8285714285714288e-05, "loss": 0.0005, "num_tokens": 8547615.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 464.125, "completions/mean_terminated_length": 464.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.1833610035048884, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04849442606791854, "learning_rate": 1.830414746543779e-05, "loss": 0.0019, "num_tokens": 8556472.0, "reward": 0.6607142686843872, "reward_std": 0.5555838942527771, "rewards/fixed_code_pass_all_test_reward/mean": 0.0357142873108387, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 366.875, "completions/mean_terminated_length": 366.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.18354547131525548, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04402253753505647, "learning_rate": 1.832258064516129e-05, "loss": 0.0018, "num_tokens": 8566039.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 455.625, "completions/mean_terminated_length": 455.625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.18372993912562258, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.008188151550712064, "learning_rate": 1.8341013824884796e-05, "loss": 0.0003, "num_tokens": 8579276.0, "reward": 1.8099173307418823, "reward_std": 0.20320695638656616, "rewards/fixed_code_pass_all_test_reward/mean": 0.8099173307418823, "rewards/fixed_code_pass_all_test_reward/std": 0.2032068967819214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 634.375, "completions/mean_terminated_length": 634.375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.18391440693598968, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.03221911733271554, "learning_rate": 1.8359447004608297e-05, "loss": 0.0013, "num_tokens": 8594399.0, "reward": 1.6534091234207153, "reward_std": 0.6409562826156616, "rewards/fixed_code_pass_all_test_reward/mean": 0.7784091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.39285045862197876, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 331.875, "completions/mean_terminated_length": 331.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.18409887474635675, "frac_reward_zero_std": 1.0, "grad_norm": 0.60546875, "kl": 0.07009333209134638, "learning_rate": 1.83778801843318e-05, "loss": 0.0028, "num_tokens": 8601622.0, "reward": 1.2222222089767456, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 1154.625, "completions/mean_terminated_length": 1027.0, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.18428334255672385, "frac_reward_zero_std": 0.0, "grad_norm": 0.478515625, "kl": 0.00859526326530613, "learning_rate": 1.83963133640553e-05, "loss": 0.0003, "num_tokens": 8620883.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.18446781036709095, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.03287659399211407, "learning_rate": 1.8414746543778802e-05, "loss": 0.0013, "num_tokens": 8630058.0, "reward": 1.807692289352417, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18465227817745802, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.01720403239596635, "learning_rate": 1.8433179723502307e-05, "loss": 0.0007, "num_tokens": 8638497.0, "reward": 1.8790322542190552, "reward_std": 0.34214845299720764, "rewards/fixed_code_pass_all_test_reward/mean": 0.8790322542190552, "rewards/fixed_code_pass_all_test_reward/std": 0.34214845299720764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.18483674598782512, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.03713111043907702, "learning_rate": 1.845161290322581e-05, "loss": 0.0015, "num_tokens": 8646487.0, "reward": 1.9147727489471436, "reward_std": 0.20535552501678467, "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.20535555481910706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.18502121379819222, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.05610498250462115, "learning_rate": 1.847004608294931e-05, "loss": 0.0022, "num_tokens": 8656250.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1852056816085593, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.03201098367571831, "learning_rate": 1.848847926267281e-05, "loss": 0.0013, "num_tokens": 8664434.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1853901494189264, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.02570451988140121, "learning_rate": 1.8506912442396313e-05, "loss": 0.001, "num_tokens": 8670856.0, "reward": 1.6881721019744873, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.1855746172292935, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.029941468965262175, "learning_rate": 1.8525345622119818e-05, "loss": 0.0012, "num_tokens": 8676534.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.18575908503966057, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.026903101592324674, "learning_rate": 1.854377880184332e-05, "loss": 0.0011, "num_tokens": 8688949.0, "reward": 1.966292142868042, "reward_std": 0.06241482123732567, "rewards/fixed_code_pass_all_test_reward/mean": 0.966292142868042, "rewards/fixed_code_pass_all_test_reward/std": 0.062414851039648056, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 554.875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.18594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.02123036654666066, "learning_rate": 1.856221198156682e-05, "loss": 0.0008, "num_tokens": 8708660.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.18612802066039477, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.029438258847221732, "learning_rate": 1.8580645161290326e-05, "loss": 0.0012, "num_tokens": 8720059.0, "reward": 1.402438998222351, "reward_std": 0.20322878658771515, "rewards/fixed_code_pass_all_test_reward/mean": 0.40243905782699585, "rewards/fixed_code_pass_all_test_reward/std": 0.20322881639003754, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.18631248847076184, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.033820669865235686, "learning_rate": 1.8599078341013824e-05, "loss": 0.0014, "num_tokens": 8724705.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.18649695628112894, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05939379730261862, "learning_rate": 1.861751152073733e-05, "loss": 0.0024, "num_tokens": 8733606.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.18668142409149605, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.038287939270958304, "learning_rate": 1.863594470046083e-05, "loss": 0.0015, "num_tokens": 8741323.0, "reward": 1.6736111640930176, "reward_std": 0.3493979275226593, "rewards/fixed_code_pass_all_test_reward/mean": 0.6736111640930176, "rewards/fixed_code_pass_all_test_reward/std": 0.3493979573249817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.18686589190186312, "frac_reward_zero_std": 0.0, "grad_norm": 0.6171875, "kl": 0.029156034695915878, "learning_rate": 1.8654377880184332e-05, "loss": 0.0012, "num_tokens": 8751515.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18705035971223022, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05682164872996509, "learning_rate": 1.8672811059907837e-05, "loss": 0.0023, "num_tokens": 8759281.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 523.625, "completions/mean_terminated_length": 523.625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.18723482752259732, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.05350975785404444, "learning_rate": 1.869124423963134e-05, "loss": 0.0021, "num_tokens": 8769398.0, "reward": 1.3318965435028076, "reward_std": 0.13410648703575134, "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.13410645723342896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.1874192953329644, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.03424076386727393, "learning_rate": 1.870967741935484e-05, "loss": 0.0014, "num_tokens": 8773268.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.1876037631433315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.02082297159358859, "learning_rate": 1.872811059907834e-05, "loss": 0.0008, "num_tokens": 8778944.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.1877882309536986, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.026023157639428973, "learning_rate": 1.8746543778801843e-05, "loss": 0.001, "num_tokens": 8789269.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.18797269876406567, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.034676977433264256, "learning_rate": 1.8764976958525348e-05, "loss": 0.0014, "num_tokens": 8795116.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 434.75, "completions/mean_terminated_length": 434.75, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.18815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.02111108449753374, "learning_rate": 1.878341013824885e-05, "loss": 0.0008, "num_tokens": 8801842.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.18834163438479984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.023721578298136592, "learning_rate": 1.880184331797235e-05, "loss": 0.0009, "num_tokens": 8808060.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 396.875, "completions/mean_terminated_length": 396.875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.18852610219516694, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04490457125939429, "learning_rate": 1.8820276497695853e-05, "loss": 0.0018, "num_tokens": 8820187.0, "reward": 1.6363636255264282, "reward_std": 0.2571297287940979, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 606.75, "completions/mean_terminated_length": 606.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.18871057000553404, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.01960376021452248, "learning_rate": 1.8838709677419354e-05, "loss": 0.0008, "num_tokens": 8832201.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 229.875, "completions/mean_terminated_length": 229.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1888950378159011, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.038473634980618954, "learning_rate": 1.885714285714286e-05, "loss": 0.0015, "num_tokens": 8837048.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 885.625, "completions/mean_terminated_length": 885.625, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.1890795056262682, "frac_reward_zero_std": 0.0, "grad_norm": 0.435546875, "kl": 0.014221677207387984, "learning_rate": 1.887557603686636e-05, "loss": 0.0006, "num_tokens": 8852869.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 367.625, "completions/mean_terminated_length": 367.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.1892639734366353, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.03798526444006711, "learning_rate": 1.8894009216589862e-05, "loss": 0.0015, "num_tokens": 8861650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 466.75, "completions/mean_terminated_length": 466.75, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.18944844124700239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.024337452370673418, "learning_rate": 1.8912442396313367e-05, "loss": 0.001, "num_tokens": 8870176.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 410.875, "completions/mean_terminated_length": 410.875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.1896329090573695, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.051637128461152315, "learning_rate": 1.893087557603687e-05, "loss": 0.0021, "num_tokens": 8877895.0, "reward": 1.3106060028076172, "reward_std": 0.44499343633651733, "rewards/fixed_code_pass_all_test_reward/mean": 0.43560606241226196, "rewards/fixed_code_pass_all_test_reward/std": 0.22903358936309814, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1898173768677366, "frac_reward_zero_std": 1.0, "grad_norm": 0.259765625, "kl": 0.06695012468844652, "learning_rate": 1.894930875576037e-05, "loss": 0.0027, "num_tokens": 8886357.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.19000184467810366, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.024263644823804498, "learning_rate": 1.896774193548387e-05, "loss": 0.001, "num_tokens": 8892802.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.19018631248847076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.02297888114117086, "learning_rate": 1.8986175115207373e-05, "loss": 0.0009, "num_tokens": 8902466.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.19037078029883786, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04576683440245688, "learning_rate": 1.9004608294930878e-05, "loss": 0.0018, "num_tokens": 8907334.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.19055524810920493, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.028550986666232347, "learning_rate": 1.902304147465438e-05, "loss": 0.0011, "num_tokens": 8911778.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.19073971591957203, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.048959774896502495, "learning_rate": 1.904147465437788e-05, "loss": 0.002, "num_tokens": 8918270.0, "reward": 1.5150001049041748, "reward_std": 0.3342795968055725, "rewards/fixed_code_pass_all_test_reward/mean": 0.5149999856948853, "rewards/fixed_code_pass_all_test_reward/std": 0.3342796266078949, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.19092418372993913, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.08370061742607504, "learning_rate": 1.9059907834101383e-05, "loss": 0.0033, "num_tokens": 8928165.0, "reward": 1.7884615659713745, "reward_std": 0.3916930854320526, "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.391693115234375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1911086515403062, "frac_reward_zero_std": 1.0, "grad_norm": 0.306640625, "kl": 0.04091053269803524, "learning_rate": 1.9078341013824884e-05, "loss": 0.0016, "num_tokens": 8932136.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 427.125, "completions/mean_terminated_length": 427.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.1912931193506733, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04178572306409478, "learning_rate": 1.909677419354839e-05, "loss": 0.0017, "num_tokens": 8942313.0, "reward": 1.5625, "reward_std": 0.4172614812850952, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 413.25, "completions/mean_terminated_length": 413.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.1914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.016138374689035118, "learning_rate": 1.911520737327189e-05, "loss": 0.0006, "num_tokens": 8952707.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 390.375, "completions/mean_terminated_length": 390.375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.19166205497140748, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.037209371803328395, "learning_rate": 1.9133640552995392e-05, "loss": 0.0015, "num_tokens": 8960054.0, "reward": 1.2805850505828857, "reward_std": 0.01880606822669506, "rewards/fixed_code_pass_all_test_reward/mean": 0.2805851101875305, "rewards/fixed_code_pass_all_test_reward/std": 0.018806030973792076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 478.375, "completions/mean_terminated_length": 478.375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.19184652278177458, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.041813582414761186, "learning_rate": 1.9152073732718897e-05, "loss": 0.0017, "num_tokens": 8971665.0, "reward": 1.7083332538604736, "reward_std": 0.412545770406723, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.41254574060440063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.19203099059214168, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.026070336112752557, "learning_rate": 1.91705069124424e-05, "loss": 0.001, "num_tokens": 8983484.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.19221545840250875, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.016754469892475754, "learning_rate": 1.91889400921659e-05, "loss": 0.0007, "num_tokens": 8987806.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 272.375, "completions/mean_terminated_length": 272.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.19239992621287585, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.042511930922046304, "learning_rate": 1.9207373271889402e-05, "loss": 0.0017, "num_tokens": 8995737.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.19258439402324296, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.06580156041309237, "learning_rate": 1.9225806451612907e-05, "loss": 0.0026, "num_tokens": 9004881.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 419.125, "completions/mean_terminated_length": 419.125, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.19276886183361003, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03426528465934098, "learning_rate": 1.9244239631336408e-05, "loss": 0.0014, "num_tokens": 9012754.0, "reward": 1.6304347515106201, "reward_std": 0.27002573013305664, "rewards/fixed_code_pass_all_test_reward/mean": 0.6304347515106201, "rewards/fixed_code_pass_all_test_reward/std": 0.27002567052841187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.19295332964397713, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.031216715462505817, "learning_rate": 1.926267281105991e-05, "loss": 0.0012, "num_tokens": 9019601.0, "reward": 1.5, "reward_std": 0.360259473323822, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.19093996286392212, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 644.5, "completions/mean_terminated_length": 644.5, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.19313779745434423, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.02768910489976406, "learning_rate": 1.928110599078341e-05, "loss": 0.0011, "num_tokens": 9031093.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 650.125, "completions/mean_terminated_length": 650.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.1933222652647113, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.027823954238556325, "learning_rate": 1.9299539170506913e-05, "loss": 0.0011, "num_tokens": 9044534.0, "reward": 1.6875, "reward_std": 0.42889204621315, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.14847105741500854, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1935067330750784, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.026442745584063232, "learning_rate": 1.9317972350230418e-05, "loss": 0.0011, "num_tokens": 9051968.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 566.25, "completions/mean_terminated_length": 566.25, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.1936912008854455, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.035375138744711876, "learning_rate": 1.933640552995392e-05, "loss": 0.0014, "num_tokens": 9062242.0, "reward": 1.1120688915252686, "reward_std": 0.07543544471263885, "rewards/fixed_code_pass_all_test_reward/mean": 0.11206896603107452, "rewards/fixed_code_pass_all_test_reward/std": 0.07543543726205826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 371.125, "completions/mean_terminated_length": 371.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.19387566869581258, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04167580115608871, "learning_rate": 1.935483870967742e-05, "loss": 0.0017, "num_tokens": 9072155.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 214.75, "completions/mean_terminated_length": 214.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.19406013650617968, "frac_reward_zero_std": 0.0, "grad_norm": 4.5, "kl": 0.17124384664930403, "learning_rate": 1.9373271889400926e-05, "loss": 0.0068, "num_tokens": 9076641.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.19424460431654678, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.056981916539371014, "learning_rate": 1.9391705069124424e-05, "loss": 0.0023, "num_tokens": 9087012.0, "reward": 1.1875, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.19442907212691385, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.031192297814413905, "learning_rate": 1.941013824884793e-05, "loss": 0.0012, "num_tokens": 9091548.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.19461353993728095, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.014351689431350678, "learning_rate": 1.942857142857143e-05, "loss": 0.0006, "num_tokens": 9097245.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.19479800774764802, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.030655331094749272, "learning_rate": 1.9447004608294932e-05, "loss": 0.0012, "num_tokens": 9100955.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.19498247555801512, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.031449867179617286, "learning_rate": 1.9465437788018437e-05, "loss": 0.0013, "num_tokens": 9105444.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.19516694336838222, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.04106221976689994, "learning_rate": 1.948387096774194e-05, "loss": 0.0016, "num_tokens": 9112991.0, "reward": 1.6308139562606812, "reward_std": 0.22077718377113342, "rewards/fixed_code_pass_all_test_reward/mean": 0.6308139562606812, "rewards/fixed_code_pass_all_test_reward/std": 0.22077718377113342, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1953514111787493, "frac_reward_zero_std": 1.0, "grad_norm": 0.4609375, "kl": 0.053116932045668364, "learning_rate": 1.950230414746544e-05, "loss": 0.0021, "num_tokens": 9119100.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1955358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.036159938434138894, "learning_rate": 1.952073732718894e-05, "loss": 0.0014, "num_tokens": 9122994.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1957203467994835, "frac_reward_zero_std": 1.0, "grad_norm": 0.052490234375, "kl": 0.046173508977517486, "learning_rate": 1.9539170506912443e-05, "loss": 0.0018, "num_tokens": 9129401.0, "reward": 1.2872340679168701, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.28723403811454773, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.19590481460985057, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.03306983155198395, "learning_rate": 1.9557603686635948e-05, "loss": 0.0013, "num_tokens": 9137284.0, "reward": 0.7573529481887817, "reward_std": 0.46762460470199585, "rewards/fixed_code_pass_all_test_reward/mean": 0.007352941203862429, "rewards/fixed_code_pass_all_test_reward/std": 0.013615001924335957, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.19608928242021767, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07243763422593474, "learning_rate": 1.957603686635945e-05, "loss": 0.0029, "num_tokens": 9147275.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.19627375023058477, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.028038288466632366, "learning_rate": 1.959447004608295e-05, "loss": 0.0011, "num_tokens": 9152371.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.19645821804095184, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05310514033772051, "learning_rate": 1.9612903225806452e-05, "loss": 0.0021, "num_tokens": 9164757.0, "reward": 1.0258619785308838, "reward_std": 0.015962397679686546, "rewards/fixed_code_pass_all_test_reward/mean": 0.025862067937850952, "rewards/fixed_code_pass_all_test_reward/std": 0.01596241630613804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.19664268585131894, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.08375988807529211, "learning_rate": 1.9631336405529954e-05, "loss": 0.0034, "num_tokens": 9168731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.19682715366168604, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.044415622018277645, "learning_rate": 1.964976958525346e-05, "loss": 0.0018, "num_tokens": 9179193.0, "reward": 1.4614661931991577, "reward_std": 0.5035419464111328, "rewards/fixed_code_pass_all_test_reward/mean": 0.4614661633968353, "rewards/fixed_code_pass_all_test_reward/std": 0.5035419464111328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.19701162147205312, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.08485871041193604, "learning_rate": 1.966820276497696e-05, "loss": 0.0034, "num_tokens": 9188704.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.19719608928242022, "frac_reward_zero_std": 1.0, "grad_norm": 0.2578125, "kl": 0.11099720187485218, "learning_rate": 1.9686635944700462e-05, "loss": 0.0044, "num_tokens": 9196498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.19738055709278732, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.057019088184461, "learning_rate": 1.9705069124423967e-05, "loss": 0.0023, "num_tokens": 9203085.0, "reward": 1.9057971239089966, "reward_std": 0.10070707648992538, "rewards/fixed_code_pass_all_test_reward/mean": 0.9057971239089966, "rewards/fixed_code_pass_all_test_reward/std": 0.10070714354515076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1975650249031544, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.02561216347385198, "learning_rate": 1.9723502304147465e-05, "loss": 0.001, "num_tokens": 9208636.0, "reward": 1.9375, "reward_std": 0.09449111670255661, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1977494927135215, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.03833519690670073, "learning_rate": 1.974193548387097e-05, "loss": 0.0015, "num_tokens": 9214763.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1979339605238886, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.08201934862881899, "learning_rate": 1.976036866359447e-05, "loss": 0.0033, "num_tokens": 9224806.0, "reward": 1.3684210777282715, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3684210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 457.25, "completions/mean_terminated_length": 457.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.19811842833425566, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04412001185119152, "learning_rate": 1.9778801843317973e-05, "loss": 0.0018, "num_tokens": 9236048.0, "reward": 1.9008619785308838, "reward_std": 0.036574505269527435, "rewards/fixed_code_pass_all_test_reward/mean": 0.9008620381355286, "rewards/fixed_code_pass_all_test_reward/std": 0.03657449036836624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.19830289614462276, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.07448474573902786, "learning_rate": 1.9797235023041478e-05, "loss": 0.003, "num_tokens": 9247186.0, "reward": 1.4969512224197388, "reward_std": 0.09395557641983032, "rewards/fixed_code_pass_all_test_reward/mean": 0.49695122241973877, "rewards/fixed_code_pass_all_test_reward/std": 0.09395559877157211, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 613.5, "completions/mean_terminated_length": 613.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.19848736395498986, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.03945091995410621, "learning_rate": 1.981566820276498e-05, "loss": 0.0016, "num_tokens": 9260358.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.19867183176535694, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.04427030589431524, "learning_rate": 1.983410138248848e-05, "loss": 0.0018, "num_tokens": 9264704.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.19885629957572404, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.1550848176702857, "learning_rate": 1.9852534562211983e-05, "loss": 0.0062, "num_tokens": 9270305.0, "reward": 1.1607142686843872, "reward_std": 0.5634002089500427, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.33284705877304077, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.19904076738609114, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.055282536428421736, "learning_rate": 1.9870967741935484e-05, "loss": 0.0022, "num_tokens": 9274187.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1992252351964582, "frac_reward_zero_std": 1.0, "grad_norm": 0.248046875, "kl": 0.08959878934547305, "learning_rate": 1.988940092165899e-05, "loss": 0.0036, "num_tokens": 9283986.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1994097030068253, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.1725041288882494, "learning_rate": 1.990783410138249e-05, "loss": 0.0069, "num_tokens": 9292738.0, "reward": 1.7817796468734741, "reward_std": 0.4042939841747284, "rewards/fixed_code_pass_all_test_reward/mean": 0.7817796468734741, "rewards/fixed_code_pass_all_test_reward/std": 0.4042940139770508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1995941708171924, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.08496152609586716, "learning_rate": 1.9926267281105992e-05, "loss": 0.0034, "num_tokens": 9299315.0, "reward": 1.6956522464752197, "reward_std": 0.12297506630420685, "rewards/fixed_code_pass_all_test_reward/mean": 0.6956522464752197, "rewards/fixed_code_pass_all_test_reward/std": 0.12297508865594864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.19977863862755948, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.10247335932217538, "learning_rate": 1.9944700460829494e-05, "loss": 0.0041, "num_tokens": 9307925.0, "reward": 1.7529070377349854, "reward_std": 0.45756959915161133, "rewards/fixed_code_pass_all_test_reward/mean": 0.7529069781303406, "rewards/fixed_code_pass_all_test_reward/std": 0.45756959915161133, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 586.25, "completions/mean_terminated_length": 586.25, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.19996310643792659, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.05561680067330599, "learning_rate": 1.9963133640552995e-05, "loss": 0.0022, "num_tokens": 9319391.0, "reward": 1.7999999523162842, "reward_std": 0.33166247606277466, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.33166250586509705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 349.125, "completions/mean_terminated_length": 349.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.20014757424829369, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.1019435403868556, "learning_rate": 1.99815668202765e-05, "loss": 0.0041, "num_tokens": 9328320.0, "reward": 1.5433673858642578, "reward_std": 0.15308551490306854, "rewards/fixed_code_pass_all_test_reward/mean": 0.5433673858642578, "rewards/fixed_code_pass_all_test_reward/std": 0.15308551490306854, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 396.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.20033204205866076, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.09953207708895206, "learning_rate": 2e-05, "loss": 0.004, "num_tokens": 9337957.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 792.625, "completions/mean_terminated_length": 792.625, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.20051650986902786, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.04149740864522755, "learning_rate": 1.9999999481633253e-05, "loss": 0.0017, "num_tokens": 9354746.0, "reward": 1.53125, "reward_std": 0.5077524185180664, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.20070097767939493, "frac_reward_zero_std": 1.0, "grad_norm": 0.259765625, "kl": 0.11919962940737605, "learning_rate": 1.9999997926533058e-05, "loss": 0.0048, "num_tokens": 9363459.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.20088544548976203, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.05876771756447852, "learning_rate": 1.999999533469958e-05, "loss": 0.0024, "num_tokens": 9371918.0, "reward": 1.3250000476837158, "reward_std": 0.5625198483467102, "rewards/fixed_code_pass_all_test_reward/mean": 0.45000001788139343, "rewards/fixed_code_pass_all_test_reward/std": 0.2507132887840271, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.20106991330012913, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.06345975468866527, "learning_rate": 1.9999991706133083e-05, "loss": 0.0025, "num_tokens": 9385942.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2012543811104962, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.09511996665969491, "learning_rate": 1.9999987040833953e-05, "loss": 0.0038, "num_tokens": 9393279.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2014388489208633, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.06794558092951775, "learning_rate": 1.9999981338802665e-05, "loss": 0.0027, "num_tokens": 9397347.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2016233167312304, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05465734051540494, "learning_rate": 1.9999974600039814e-05, "loss": 0.0022, "num_tokens": 9401266.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.20180778454159748, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05780915659852326, "learning_rate": 1.9999966824546095e-05, "loss": 0.0023, "num_tokens": 9409711.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.20199225235196458, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.03219430800527334, "learning_rate": 1.9999958012322317e-05, "loss": 0.0013, "num_tokens": 9414074.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.20217672016233168, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.051416757283732295, "learning_rate": 1.9999948163369395e-05, "loss": 0.0021, "num_tokens": 9422523.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.20236118797269875, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.08728176075965166, "learning_rate": 1.9999937277688347e-05, "loss": 0.0035, "num_tokens": 9426614.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 365.0, "completions/mean_terminated_length": 365.0, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.20254565578306585, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04686649201903492, "learning_rate": 1.9999925355280302e-05, "loss": 0.0019, "num_tokens": 9437302.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20273012359343295, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.050774041563272476, "learning_rate": 1.9999912396146495e-05, "loss": 0.002, "num_tokens": 9444812.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 107.0, "completions/mean_terminated_length": 107.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.20291459140380003, "frac_reward_zero_std": 0.0, "grad_norm": 7.125, "kl": 0.13979557622224092, "learning_rate": 1.9999898400288278e-05, "loss": 0.0056, "num_tokens": 9448540.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.20309905921416713, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.060022128745913506, "learning_rate": 1.999988336770709e-05, "loss": 0.0024, "num_tokens": 9457828.0, "reward": 1.655063271522522, "reward_std": 0.05273657664656639, "rewards/fixed_code_pass_all_test_reward/mean": 0.655063271522522, "rewards/fixed_code_pass_all_test_reward/std": 0.05273657664656639, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.20328352702453423, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.05706826550886035, "learning_rate": 1.9999867298404498e-05, "loss": 0.0023, "num_tokens": 9461814.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 140.75, "completions/mean_terminated_length": 140.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2034679948349013, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.06405069655738771, "learning_rate": 1.9999850192382163e-05, "loss": 0.0026, "num_tokens": 9466012.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2036524626452684, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.07642073556780815, "learning_rate": 1.999983204964186e-05, "loss": 0.0031, "num_tokens": 9472015.0, "reward": 1.8858695030212402, "reward_std": 0.3228096067905426, "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2038369304556355, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.04644873715005815, "learning_rate": 1.999981287018547e-05, "loss": 0.0019, "num_tokens": 9476672.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.20402139826600257, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.05783520103432238, "learning_rate": 1.999979265401498e-05, "loss": 0.0023, "num_tokens": 9486257.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.20420586607636967, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.049751730635762215, "learning_rate": 1.999977140113249e-05, "loss": 0.002, "num_tokens": 9494186.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.20439033388673677, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04597937222570181, "learning_rate": 1.9999749111540197e-05, "loss": 0.0018, "num_tokens": 9502361.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.20457480169710385, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04796504694968462, "learning_rate": 1.9999725785240423e-05, "loss": 0.0019, "num_tokens": 9509524.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.20475926950747095, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.034556994680315256, "learning_rate": 1.9999701422235574e-05, "loss": 0.0014, "num_tokens": 9516303.0, "reward": 1.220588207244873, "reward_std": 0.04159453138709068, "rewards/fixed_code_pass_all_test_reward/mean": 0.22058823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.04159452021121979, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.20494373731783805, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.036939756479114294, "learning_rate": 1.9999676022528178e-05, "loss": 0.0015, "num_tokens": 9524512.0, "reward": 1.0833333730697632, "reward_std": 0.1543033868074417, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.20512820512820512, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.024581013014540076, "learning_rate": 1.9999649586120875e-05, "loss": 0.001, "num_tokens": 9538757.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.20531267293857222, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.06351345451548696, "learning_rate": 1.9999622113016402e-05, "loss": 0.0025, "num_tokens": 9543526.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.20549714074893932, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0589489764533937, "learning_rate": 1.9999593603217605e-05, "loss": 0.0024, "num_tokens": 9555188.0, "reward": 1.8499999046325684, "reward_std": 0.29760950803756714, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2976095378398895, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 108.25, "completions/mean_terminated_length": 108.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.2056816085593064, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.0585943921469152, "learning_rate": 1.9999564056727442e-05, "loss": 0.0023, "num_tokens": 9558974.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2058660763696735, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04888463672250509, "learning_rate": 1.9999533473548976e-05, "loss": 0.002, "num_tokens": 9566255.0, "reward": 1.796875, "reward_std": 0.3761144280433655, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3761144280433655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2060505441800406, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.031907302094623446, "learning_rate": 1.9999501853685378e-05, "loss": 0.0013, "num_tokens": 9573884.0, "reward": 1.2109375, "reward_std": 0.29112139344215393, "rewards/fixed_code_pass_all_test_reward/mean": 0.2109375, "rewards/fixed_code_pass_all_test_reward/std": 0.2911214232444763, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.20623501199040767, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.03965413593687117, "learning_rate": 1.9999469197139928e-05, "loss": 0.0016, "num_tokens": 9583100.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.20641947980077477, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.11001327354460955, "learning_rate": 1.9999435503916003e-05, "loss": 0.0044, "num_tokens": 9592249.0, "reward": 0.771634578704834, "reward_std": 0.7069153189659119, "rewards/fixed_code_pass_all_test_reward/mean": 0.14663462340831757, "rewards/fixed_code_pass_all_test_reward/std": 0.3258608281612396, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.20660394761114187, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.05270298011600971, "learning_rate": 1.9999400774017105e-05, "loss": 0.0021, "num_tokens": 9596179.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 351.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.20678841542150894, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.021906271926127374, "learning_rate": 1.9999365007446837e-05, "loss": 0.0009, "num_tokens": 9603410.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.20697288323187604, "frac_reward_zero_std": 1.0, "grad_norm": 0.03369140625, "kl": 0.021144459256902337, "learning_rate": 1.9999328204208893e-05, "loss": 0.0008, "num_tokens": 9611079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 251.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.20715735104224312, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.030967160826548934, "learning_rate": 1.9999290364307105e-05, "loss": 0.0012, "num_tokens": 9617599.0, "reward": 1.2010868787765503, "reward_std": 0.046115659177303314, "rewards/fixed_code_pass_all_test_reward/mean": 0.20108693838119507, "rewards/fixed_code_pass_all_test_reward/std": 0.046115655452013016, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.20734181885261022, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.059347836300730705, "learning_rate": 1.9999251487745386e-05, "loss": 0.0024, "num_tokens": 9626442.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.20752628666297732, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.06781802047044039, "learning_rate": 1.9999211574527767e-05, "loss": 0.0027, "num_tokens": 9631062.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2077107544733444, "frac_reward_zero_std": 1.0, "grad_norm": 0.2451171875, "kl": 0.05504463938996196, "learning_rate": 1.999917062465839e-05, "loss": 0.0022, "num_tokens": 9639241.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.029047264833934605, "learning_rate": 1.99991286381415e-05, "loss": 0.0012, "num_tokens": 9644307.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2080796900940786, "frac_reward_zero_std": 1.0, "grad_norm": 0.216796875, "kl": 0.07558548729866743, "learning_rate": 1.9999085614981443e-05, "loss": 0.003, "num_tokens": 9653181.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 95.375, "completions/mean_terminated_length": 95.375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.20826415790444566, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.06220046244561672, "learning_rate": 1.999904155518269e-05, "loss": 0.0025, "num_tokens": 9657432.0, "reward": 0.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 110.75, "completions/mean_terminated_length": 110.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.20844862571481276, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.0670712236315012, "learning_rate": 1.99989964587498e-05, "loss": 0.0027, "num_tokens": 9661014.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.20863309352517986, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.03478195331990719, "learning_rate": 1.999895032568745e-05, "loss": 0.0014, "num_tokens": 9669807.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 543.125, "completions/mean_terminated_length": 543.125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.20881756133554694, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.028197588166221976, "learning_rate": 1.999890315600043e-05, "loss": 0.0011, "num_tokens": 9685200.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.20900202914591404, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.10967082437127829, "learning_rate": 1.999885494969362e-05, "loss": 0.0044, "num_tokens": 9688918.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.20918649695628114, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0661507723852992, "learning_rate": 1.9998805706772022e-05, "loss": 0.0026, "num_tokens": 9697807.0, "reward": 1.7840908765792847, "reward_std": 0.10300161689519882, "rewards/fixed_code_pass_all_test_reward/mean": 0.7840909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.10300164669752121, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.2093709647666482, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.025793679291382432, "learning_rate": 1.9998755427240745e-05, "loss": 0.001, "num_tokens": 9707799.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2095554325770153, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04265776718966663, "learning_rate": 1.9998704111104993e-05, "loss": 0.0017, "num_tokens": 9715835.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2097399003873824, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03189303330145776, "learning_rate": 1.9998651758370097e-05, "loss": 0.0013, "num_tokens": 9724722.0, "reward": 1.5798611640930176, "reward_std": 0.46420300006866455, "rewards/fixed_code_pass_all_test_reward/mean": 0.7048611044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.4181027114391327, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.20992436819774948, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.031910013407468796, "learning_rate": 1.9998598369041474e-05, "loss": 0.0013, "num_tokens": 9735034.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.21010883600811658, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.05286032170988619, "learning_rate": 1.9998543943124667e-05, "loss": 0.0021, "num_tokens": 9739211.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.21029330381848368, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.03506349853705615, "learning_rate": 1.9998488480625314e-05, "loss": 0.0014, "num_tokens": 9746367.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.21047777162885076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.06959897326305509, "learning_rate": 1.999843198154917e-05, "loss": 0.0028, "num_tokens": 9753772.0, "reward": 1.4666666984558105, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.46666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21066223943921786, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.0557419213000685, "learning_rate": 1.9998374445902084e-05, "loss": 0.0022, "num_tokens": 9761207.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.21084670724958496, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.02704835485201329, "learning_rate": 1.999831587369003e-05, "loss": 0.0011, "num_tokens": 9770620.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.21103117505995203, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.038854264887049794, "learning_rate": 1.9998256264919072e-05, "loss": 0.0016, "num_tokens": 9777655.0, "reward": 1.3177082538604736, "reward_std": 0.16793787479400635, "rewards/fixed_code_pass_all_test_reward/mean": 0.3177083134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.16793787479400635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.21121564287031913, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.030863576103001833, "learning_rate": 1.9998195619595396e-05, "loss": 0.0012, "num_tokens": 9783967.0, "reward": 1.545454502105713, "reward_std": 0.500295102596283, "rewards/fixed_code_pass_all_test_reward/mean": 0.5454545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.500295102596283, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.21140011068068623, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.04182061390019953, "learning_rate": 1.9998133937725284e-05, "loss": 0.0017, "num_tokens": 9790077.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2115845784910533, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.02552569843828678, "learning_rate": 1.999807121931514e-05, "loss": 0.001, "num_tokens": 9796813.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 394.75, "completions/mean_terminated_length": 394.75, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.2117690463014204, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.05180608527734876, "learning_rate": 1.9998007464371458e-05, "loss": 0.0021, "num_tokens": 9808379.0, "reward": 1.5208333730697632, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2119535141117875, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.07691930164583027, "learning_rate": 1.999794267290085e-05, "loss": 0.0031, "num_tokens": 9817545.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.21213798192215458, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.0753004492726177, "learning_rate": 1.999787684491003e-05, "loss": 0.003, "num_tokens": 9827178.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 855.625, "completions/mean_terminated_length": 685.2857666015625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.21232244973252168, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.015278811217285693, "learning_rate": 1.999780998040583e-05, "loss": 0.0006, "num_tokens": 9841103.0, "reward": 0.689453125, "reward_std": 0.5716906189918518, "rewards/fixed_code_pass_all_test_reward/mean": 0.064453125, "rewards/fixed_code_pass_all_test_reward/std": 0.06105329096317291, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 601.5, "completions/mean_terminated_length": 601.5, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.21250691754288878, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.024926221230998635, "learning_rate": 1.9997742079395178e-05, "loss": 0.001, "num_tokens": 9852283.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.21269138535325585, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03114215424284339, "learning_rate": 1.9997673141885113e-05, "loss": 0.0012, "num_tokens": 9860758.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.21287585316362295, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.13573351502418518, "learning_rate": 1.999760316788278e-05, "loss": 0.0054, "num_tokens": 9867538.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.21306032097399003, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07483479520305991, "learning_rate": 1.9997532157395432e-05, "loss": 0.003, "num_tokens": 9871699.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 446.0, "completions/mean_terminated_length": 217.1428680419922, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.21324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.04914769879542291, "learning_rate": 1.9997460110430443e-05, "loss": 0.002, "num_tokens": 9880851.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.21342925659472423, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.04019129183143377, "learning_rate": 1.999738702699527e-05, "loss": 0.0016, "num_tokens": 9887631.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.2136137244050913, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.02923118695616722, "learning_rate": 1.9997312907097495e-05, "loss": 0.0012, "num_tokens": 9911400.0, "reward": 1.5873494148254395, "reward_std": 0.20349884033203125, "rewards/fixed_code_pass_all_test_reward/mean": 0.5873494148254395, "rewards/fixed_code_pass_all_test_reward/std": 0.20349884033203125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2137981922154584, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.0431978995911777, "learning_rate": 1.9997237750744797e-05, "loss": 0.0017, "num_tokens": 9921763.0, "reward": 1.9629629850387573, "reward_std": 0.10475657135248184, "rewards/fixed_code_pass_all_test_reward/mean": 0.9629629850387573, "rewards/fixed_code_pass_all_test_reward/std": 0.10475657135248184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2139826600258255, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.043124674586579204, "learning_rate": 1.999716155794498e-05, "loss": 0.0017, "num_tokens": 9925956.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.21416712783619257, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.06822628027293831, "learning_rate": 1.9997084328705926e-05, "loss": 0.0027, "num_tokens": 9934799.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 620.125, "completions/mean_terminated_length": 620.125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.21435159564655967, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.03318669833242893, "learning_rate": 1.9997006063035655e-05, "loss": 0.0013, "num_tokens": 9950768.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.21453606345692677, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.0617794890422374, "learning_rate": 1.9996926760942276e-05, "loss": 0.0025, "num_tokens": 9955060.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.21472053126729385, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.05164128262549639, "learning_rate": 1.999684642243401e-05, "loss": 0.0021, "num_tokens": 9965975.0, "reward": 1.3295453786849976, "reward_std": 0.42898935079574585, "rewards/fixed_code_pass_all_test_reward/mean": 0.32954543828964233, "rewards/fixed_code_pass_all_test_reward/std": 0.42898938059806824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.21490499907766095, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.052842136239632964, "learning_rate": 1.999676504751919e-05, "loss": 0.0021, "num_tokens": 9970750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.21508946688802805, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.03744844766333699, "learning_rate": 1.999668263620625e-05, "loss": 0.0015, "num_tokens": 9981176.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.21527393469839512, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04440966295078397, "learning_rate": 1.9996599188503728e-05, "loss": 0.0018, "num_tokens": 9986293.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.21545840250876222, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.036768265534192324, "learning_rate": 1.9996514704420286e-05, "loss": 0.0015, "num_tokens": 9995380.0, "reward": 1.4945652484893799, "reward_std": 0.2436751127243042, "rewards/fixed_code_pass_all_test_reward/mean": 0.4945652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.24367506802082062, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.21564287031912932, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.032264088455121964, "learning_rate": 1.9996429183964673e-05, "loss": 0.0013, "num_tokens": 10004265.0, "reward": 1.3355263471603394, "reward_std": 0.539634108543396, "rewards/fixed_code_pass_all_test_reward/mean": 0.46052634716033936, "rewards/fixed_code_pass_all_test_reward/std": 0.18608075380325317, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2158273381294964, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.05429630680009723, "learning_rate": 1.999634262714576e-05, "loss": 0.0022, "num_tokens": 10007988.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2160118059398635, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.023800981929525733, "learning_rate": 1.9996255033972524e-05, "loss": 0.001, "num_tokens": 10017148.0, "reward": 1.854567289352417, "reward_std": 0.27464327216148376, "rewards/fixed_code_pass_all_test_reward/mean": 0.854567289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.27464327216148376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 99.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2161962737502306, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.31232333090156317, "learning_rate": 1.999616640445404e-05, "loss": 0.0125, "num_tokens": 10020602.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.21638074156059767, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.04309866961557418, "learning_rate": 1.99960767385995e-05, "loss": 0.0017, "num_tokens": 10031625.0, "reward": 1.5671296119689941, "reward_std": 0.3813115954399109, "rewards/fixed_code_pass_all_test_reward/mean": 0.5671296119689941, "rewards/fixed_code_pass_all_test_reward/std": 0.3813115954399109, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.21656520937096477, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0351749011897482, "learning_rate": 1.9995986036418196e-05, "loss": 0.0014, "num_tokens": 10040522.0, "reward": 1.8775510787963867, "reward_std": 0.026720546185970306, "rewards/fixed_code_pass_all_test_reward/mean": 0.8775510191917419, "rewards/fixed_code_pass_all_test_reward/std": 0.026720546185970306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.21674967718133187, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.05946945585310459, "learning_rate": 1.9995894297919536e-05, "loss": 0.0024, "num_tokens": 10044862.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 376.25, "completions/mean_terminated_length": 376.25, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.21693414499169894, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03526127338409424, "learning_rate": 1.999580152311303e-05, "loss": 0.0014, "num_tokens": 10052264.0, "reward": 1.1333333253860474, "reward_std": 0.350962370634079, "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, "rewards/fixed_code_pass_all_test_reward/std": 0.35096240043640137, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.21711861280206604, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.05083850212395191, "learning_rate": 1.9995707712008294e-05, "loss": 0.002, "num_tokens": 10056902.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 396.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.21730308061243314, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.03851512516848743, "learning_rate": 1.9995612864615056e-05, "loss": 0.0015, "num_tokens": 10064431.0, "reward": 1.045454502105713, "reward_std": 0.7606000900268555, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.44203564524650574, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 426.375, "completions/mean_terminated_length": 426.375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.21748754842280021, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.06132229929789901, "learning_rate": 1.999551698094315e-05, "loss": 0.0025, "num_tokens": 10072346.0, "reward": 1.9186046123504639, "reward_std": 0.10765351355075836, "rewards/fixed_code_pass_all_test_reward/mean": 0.9186046123504639, "rewards/fixed_code_pass_all_test_reward/std": 0.10765349864959717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 396.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.21767201623316731, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.06797252083197236, "learning_rate": 1.999542006100251e-05, "loss": 0.0027, "num_tokens": 10082103.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 637.0, "completions/mean_terminated_length": 637.0, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.21785648404353442, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.027162770507857203, "learning_rate": 1.999532210480319e-05, "loss": 0.0011, "num_tokens": 10092703.0, "reward": 1.6875, "reward_std": 0.4381372928619385, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.4381372928619385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2180409518539015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.03437066404148936, "learning_rate": 1.9995223112355347e-05, "loss": 0.0014, "num_tokens": 10098650.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.2182254196642686, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.10229458566755056, "learning_rate": 1.9995123083669238e-05, "loss": 0.0041, "num_tokens": 10102487.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 557.625, "completions/mean_terminated_length": 557.625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.2184098874746357, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.03221483645029366, "learning_rate": 1.999502201875524e-05, "loss": 0.0013, "num_tokens": 10112308.0, "reward": 1.074519157409668, "reward_std": 0.21077220141887665, "rewards/fixed_code_pass_all_test_reward/mean": 0.07451923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.21077223122119904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21859435528500276, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.04646780528128147, "learning_rate": 1.9994919917623822e-05, "loss": 0.0019, "num_tokens": 10117519.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 402.625, "completions/mean_terminated_length": 402.625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.21877882309536986, "frac_reward_zero_std": 0.0, "grad_norm": 5.96875, "kl": 0.43370580673217773, "learning_rate": 1.9994816780285576e-05, "loss": 0.0173, "num_tokens": 10127476.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 332.125, "completions/mean_terminated_length": 332.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.21896329090573696, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0636683851480484, "learning_rate": 1.9994712606751197e-05, "loss": 0.0025, "num_tokens": 10134413.0, "reward": 1.375, "reward_std": 0.39247557520866394, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.39247557520866394, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.21914775871610404, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.08024193253368139, "learning_rate": 1.9994607397031477e-05, "loss": 0.0032, "num_tokens": 10140774.0, "reward": 1.8571429252624512, "reward_std": 0.15742090344429016, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.15742090344429016, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.21933222652647114, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.05488387937657535, "learning_rate": 1.9994501151137328e-05, "loss": 0.0022, "num_tokens": 10144972.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.2195166943368382, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.10198327852413058, "learning_rate": 1.9994393869079765e-05, "loss": 0.0041, "num_tokens": 10148878.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2197011621472053, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.054983296198770404, "learning_rate": 1.999428555086991e-05, "loss": 0.0022, "num_tokens": 10154141.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 530.875, "completions/mean_terminated_length": 530.875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.2198856299575724, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.03594265994615853, "learning_rate": 1.999417619651899e-05, "loss": 0.0014, "num_tokens": 10164020.0, "reward": 1.0750000476837158, "reward_std": 0.10350986570119858, "rewards/fixed_code_pass_all_test_reward/mean": 0.07500000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.10350984334945679, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.22007009776793948, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.03255474101752043, "learning_rate": 1.9994065806038345e-05, "loss": 0.0013, "num_tokens": 10176896.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.22025456557830658, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.1157928598113358, "learning_rate": 1.999395437943942e-05, "loss": 0.0046, "num_tokens": 10183313.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 368.25, "completions/mean_terminated_length": 368.25, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.22043903338867368, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0562732694670558, "learning_rate": 1.9993841916733764e-05, "loss": 0.0023, "num_tokens": 10194379.0, "reward": 1.375, "reward_std": 0.2883436381816864, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.288343608379364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.22062350119904076, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.04111891775391996, "learning_rate": 1.9993728417933044e-05, "loss": 0.0016, "num_tokens": 10199104.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.22080796900940786, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.08965247729793191, "learning_rate": 1.9993613883049015e-05, "loss": 0.0036, "num_tokens": 10203285.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.22099243681977496, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03836157638579607, "learning_rate": 1.9993498312093557e-05, "loss": 0.0015, "num_tokens": 10213372.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.22117690463014203, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.0786507367156446, "learning_rate": 1.9993381705078657e-05, "loss": 0.0031, "num_tokens": 10217733.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.22136137244050913, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.09480518661439419, "learning_rate": 1.9993264062016397e-05, "loss": 0.0038, "num_tokens": 10227477.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 512.875, "completions/mean_terminated_length": 293.5714416503906, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.22154584025087623, "frac_reward_zero_std": 0.0, "grad_norm": 0.64453125, "kl": 0.0554881856078282, "learning_rate": 1.999314538291897e-05, "loss": 0.0022, "num_tokens": 10238108.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2217303080612433, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.04839163552969694, "learning_rate": 1.999302566779869e-05, "loss": 0.0019, "num_tokens": 10245287.0, "reward": 1.4345238208770752, "reward_std": 0.4710178077220917, "rewards/fixed_code_pass_all_test_reward/mean": 0.4345238208770752, "rewards/fixed_code_pass_all_test_reward/std": 0.47101783752441406, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.2219147758716104, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.0687980311922729, "learning_rate": 1.9992904916667963e-05, "loss": 0.0028, "num_tokens": 10252230.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2220992436819775, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.053124125581234694, "learning_rate": 1.999278312953931e-05, "loss": 0.0021, "num_tokens": 10257602.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 607.375, "completions/mean_terminated_length": 607.375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.22228371149234458, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.04890149366110563, "learning_rate": 1.9992660306425353e-05, "loss": 0.002, "num_tokens": 10268693.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.22246817930271168, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.04487239685840905, "learning_rate": 1.999253644733883e-05, "loss": 0.0018, "num_tokens": 10273185.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.22265264711307878, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.044575080974027514, "learning_rate": 1.9992411552292575e-05, "loss": 0.0018, "num_tokens": 10283377.0, "reward": 1.46875, "reward_std": 0.3830161690711975, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.3830161690711975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.22283711492344585, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08674538531340659, "learning_rate": 1.999228562129954e-05, "loss": 0.0035, "num_tokens": 10288386.0, "reward": 1.7321428060531616, "reward_std": 0.36967799067497253, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 521.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.22302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05816185474395752, "learning_rate": 1.999215865437279e-05, "loss": 0.0023, "num_tokens": 10300866.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.22320605054418005, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.06780396401882172, "learning_rate": 1.9992030651525474e-05, "loss": 0.0027, "num_tokens": 10305818.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.22339051835454712, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.03649728512391448, "learning_rate": 1.999190161277087e-05, "loss": 0.0015, "num_tokens": 10312879.0, "reward": 1.85326087474823, "reward_std": 0.13931958377361298, "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.1393195539712906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.22357498616491422, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.06630427250638604, "learning_rate": 1.9991771538122354e-05, "loss": 0.0027, "num_tokens": 10319743.0, "reward": 1.735795497894287, "reward_std": 0.09326291084289551, "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.09326295554637909, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.22375945397528132, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.05690090078860521, "learning_rate": 1.9991640427593412e-05, "loss": 0.0023, "num_tokens": 10330176.0, "reward": 1.0535714626312256, "reward_std": 0.10287558287382126, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714253783226, "rewards/fixed_code_pass_all_test_reward/std": 0.10287559032440186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2239439217856484, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0801225071772933, "learning_rate": 1.9991508281197634e-05, "loss": 0.0032, "num_tokens": 10339495.0, "reward": 1.3046875, "reward_std": 0.2828575074672699, "rewards/fixed_code_pass_all_test_reward/mean": 0.3046875, "rewards/fixed_code_pass_all_test_reward/std": 0.2828575074672699, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2241283895960155, "frac_reward_zero_std": 1.0, "grad_norm": 0.388671875, "kl": 0.12518868362531066, "learning_rate": 1.999137509894872e-05, "loss": 0.005, "num_tokens": 10343251.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2243128574063826, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.11864777142181993, "learning_rate": 1.9991240880860484e-05, "loss": 0.0047, "num_tokens": 10347126.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.22449732521674967, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.07289290986955166, "learning_rate": 1.9991105626946834e-05, "loss": 0.0029, "num_tokens": 10353785.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.22468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.045955864479765296, "learning_rate": 1.9990969337221794e-05, "loss": 0.0018, "num_tokens": 10360826.0, "reward": 1.5125000476837158, "reward_std": 0.5221863389015198, "rewards/fixed_code_pass_all_test_reward/mean": 0.512499988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.5221863389015198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.22486626083748387, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.0777191398665309, "learning_rate": 1.9990832011699496e-05, "loss": 0.0031, "num_tokens": 10365215.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.22505072864785094, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.07428921409882605, "learning_rate": 1.9990693650394176e-05, "loss": 0.003, "num_tokens": 10375246.0, "reward": 1.258333444595337, "reward_std": 0.5639289617538452, "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, "rewards/fixed_code_pass_all_test_reward/std": 0.3347588777542114, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 357.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.22523519645821805, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.051717916037887335, "learning_rate": 1.9990554253320177e-05, "loss": 0.0021, "num_tokens": 10385923.0, "reward": 1.5643938779830933, "reward_std": 0.36756086349487305, "rewards/fixed_code_pass_all_test_reward/mean": 0.5643938779830933, "rewards/fixed_code_pass_all_test_reward/std": 0.36756089329719543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.22541966426858512, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.06573385745286942, "learning_rate": 1.999041382049195e-05, "loss": 0.0026, "num_tokens": 10390840.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 516.625, "completions/mean_terminated_length": 516.625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.22560413207895222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.04347809497267008, "learning_rate": 1.9990272351924057e-05, "loss": 0.0017, "num_tokens": 10401357.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 549.25, "completions/mean_terminated_length": 549.25, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.22578859988931932, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.03982335375621915, "learning_rate": 1.9990129847631162e-05, "loss": 0.0016, "num_tokens": 10416863.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.2259730676996864, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.158511595800519, "learning_rate": 1.9989986307628036e-05, "loss": 0.0063, "num_tokens": 10421690.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.2261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.05229078442789614, "learning_rate": 1.998984173192957e-05, "loss": 0.0021, "num_tokens": 10428085.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2263420033204206, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.10355564951896667, "learning_rate": 1.9989696120550744e-05, "loss": 0.0041, "num_tokens": 10438262.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.22652647113078767, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04168181819841266, "learning_rate": 1.998954947350666e-05, "loss": 0.0017, "num_tokens": 10446662.0, "reward": 1.5357143878936768, "reward_std": 0.3561210036277771, "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3561210036277771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.22671093894115477, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.059747880324721336, "learning_rate": 1.9989401790812516e-05, "loss": 0.0024, "num_tokens": 10454541.0, "reward": 1.4090909957885742, "reward_std": 0.4312196671962738, "rewards/fixed_code_pass_all_test_reward/mean": 0.40909087657928467, "rewards/fixed_code_pass_all_test_reward/std": 0.4312196969985962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.22689540675152187, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.037049652775749564, "learning_rate": 1.9989253072483625e-05, "loss": 0.0015, "num_tokens": 10465016.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.22707987456188894, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.055796783650293946, "learning_rate": 1.9989103318535403e-05, "loss": 0.0022, "num_tokens": 10469410.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.22726434237225604, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.06423219572752714, "learning_rate": 1.998895252898338e-05, "loss": 0.0026, "num_tokens": 10478417.0, "reward": 1.90234375, "reward_std": 0.2762135863304138, "rewards/fixed_code_pass_all_test_reward/mean": 0.90234375, "rewards/fixed_code_pass_all_test_reward/std": 0.2762135863304138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.22744881018262314, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.14238657895475626, "learning_rate": 1.9988800703843187e-05, "loss": 0.0057, "num_tokens": 10484500.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.2276332779929902, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.03517152089625597, "learning_rate": 1.998864784313056e-05, "loss": 0.0014, "num_tokens": 10492116.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2278177458033573, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.04612201149575412, "learning_rate": 1.9988493946861355e-05, "loss": 0.0018, "num_tokens": 10500716.0, "reward": 1.826171875, "reward_std": 0.2656168043613434, "rewards/fixed_code_pass_all_test_reward/mean": 0.826171875, "rewards/fixed_code_pass_all_test_reward/std": 0.2656168043613434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2280022136137244, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06437376234680414, "learning_rate": 1.998833901505152e-05, "loss": 0.0026, "num_tokens": 10506959.0, "reward": 1.9318182468414307, "reward_std": 0.19284728169441223, "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2281866814240915, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.06153057049959898, "learning_rate": 1.998818304771712e-05, "loss": 0.0025, "num_tokens": 10516288.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2283711492344586, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.06342857144773006, "learning_rate": 1.998802604487433e-05, "loss": 0.0025, "num_tokens": 10522442.0, "reward": 1.3145160675048828, "reward_std": 0.42308980226516724, "rewards/fixed_code_pass_all_test_reward/mean": 0.3145161271095276, "rewards/fixed_code_pass_all_test_reward/std": 0.4230898320674896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2285556170448257, "frac_reward_zero_std": 1.0, "grad_norm": 0.21484375, "kl": 0.04567981022410095, "learning_rate": 1.998786800653941e-05, "loss": 0.0018, "num_tokens": 10531265.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.22874008485519276, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.050369120202958584, "learning_rate": 1.998770893272876e-05, "loss": 0.002, "num_tokens": 10541315.0, "reward": 1.9943182468414307, "reward_std": 0.0035068909637629986, "rewards/fixed_code_pass_all_test_reward/mean": 0.9943181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0035068909637629986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.22892455266555986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.03831752552650869, "learning_rate": 1.9987548823458868e-05, "loss": 0.0015, "num_tokens": 10548433.0, "reward": 1.8461538553237915, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.22910902047592696, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.054408963304013014, "learning_rate": 1.9987387678746334e-05, "loss": 0.0022, "num_tokens": 10557420.0, "reward": 1.5, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.22929348828629403, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.047366100596264005, "learning_rate": 1.998722549860786e-05, "loss": 0.0019, "num_tokens": 10561259.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.22947795609666113, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.09962185856420547, "learning_rate": 1.9987062283060264e-05, "loss": 0.004, "num_tokens": 10567613.0, "reward": 1.9444444179534912, "reward_std": 0.15713484585285187, "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 549.25, "completions/mean_terminated_length": 549.25, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.22966242390702823, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.023867676500231028, "learning_rate": 1.9986898032120466e-05, "loss": 0.001, "num_tokens": 10578655.0, "reward": 0.9208332896232605, "reward_std": 0.37583237886428833, "rewards/fixed_code_pass_all_test_reward/mean": 0.04583333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.05616727098822594, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2298468917173953, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.08998010493814945, "learning_rate": 1.9986732745805494e-05, "loss": 0.0036, "num_tokens": 10586067.0, "reward": 1.6666667461395264, "reward_std": 0.2760262191295624, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2760262191295624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2300313595277624, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.056372721679508686, "learning_rate": 1.998656642413248e-05, "loss": 0.0023, "num_tokens": 10597049.0, "reward": 1.295454502105713, "reward_std": 0.13527704775333405, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.13527706265449524, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2302158273381295, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.04929253668524325, "learning_rate": 1.9986399067118673e-05, "loss": 0.002, "num_tokens": 10602252.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.23040029514849658, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.03676022926811129, "learning_rate": 1.9986230674781425e-05, "loss": 0.0015, "num_tokens": 10606877.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.23058476295886368, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.052048672921955585, "learning_rate": 1.9986061247138188e-05, "loss": 0.0021, "num_tokens": 10612193.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.23076923076923078, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0927588976919651, "learning_rate": 1.998589078420653e-05, "loss": 0.0037, "num_tokens": 10618567.0, "reward": 1.1576086282730103, "reward_std": 0.06547567248344421, "rewards/fixed_code_pass_all_test_reward/mean": 0.15760868787765503, "rewards/fixed_code_pass_all_test_reward/std": 0.06547567993402481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.23095369857959785, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06392569048330188, "learning_rate": 1.998571928600412e-05, "loss": 0.0026, "num_tokens": 10627246.0, "reward": 1.75, "reward_std": 0.37988796830177307, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37988796830177307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 380.875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.23113816638996496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.026241635205224156, "learning_rate": 1.9985546752548742e-05, "loss": 0.001, "num_tokens": 10634605.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.23132263420033206, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.06730982707813382, "learning_rate": 1.9985373183858282e-05, "loss": 0.0027, "num_tokens": 10644358.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.23150710201069913, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.03514389740303159, "learning_rate": 1.9985198579950734e-05, "loss": 0.0014, "num_tokens": 10651454.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.23169156982106623, "frac_reward_zero_std": 1.0, "grad_norm": 0.42578125, "kl": 0.12064300291240215, "learning_rate": 1.99850229408442e-05, "loss": 0.0048, "num_tokens": 10658958.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2318760376314333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.03367142647039145, "learning_rate": 1.998484626655689e-05, "loss": 0.0013, "num_tokens": 10667432.0, "reward": 1.933333396911621, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9333333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2320605054418004, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.05345094995573163, "learning_rate": 1.998466855710712e-05, "loss": 0.0021, "num_tokens": 10675880.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2322449732521675, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.048464043298736215, "learning_rate": 1.9984489812513307e-05, "loss": 0.0019, "num_tokens": 10683234.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.23242944106253458, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05444542760960758, "learning_rate": 1.9984310032793997e-05, "loss": 0.0022, "num_tokens": 10691129.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.23261390887290168, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.08660539775155485, "learning_rate": 1.9984129217967815e-05, "loss": 0.0035, "num_tokens": 10697134.0, "reward": 1.5625, "reward_std": 0.47715675830841064, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.47715675830841064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.23279837668326878, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.05137174506671727, "learning_rate": 1.998394736805351e-05, "loss": 0.0021, "num_tokens": 10706314.0, "reward": 1.5555555820465088, "reward_std": 0.31426969170570374, "rewards/fixed_code_pass_all_test_reward/mean": 0.5555555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.23298284449363585, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.061082344967871904, "learning_rate": 1.998376448306994e-05, "loss": 0.0024, "num_tokens": 10710504.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 571.5, "completions/mean_terminated_length": 571.5, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.23316731230400295, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.01376206730492413, "learning_rate": 1.998358056303606e-05, "loss": 0.0006, "num_tokens": 10725764.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.23335178011437005, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0960350469686091, "learning_rate": 1.9983395607970938e-05, "loss": 0.0038, "num_tokens": 10732306.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.23353624792473712, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0864503406919539, "learning_rate": 1.998320961789375e-05, "loss": 0.0035, "num_tokens": 10739036.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.23372071573510422, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.033696640748530626, "learning_rate": 1.998302259282378e-05, "loss": 0.0013, "num_tokens": 10749324.0, "reward": 1.6590908765792847, "reward_std": 0.21041366457939148, "rewards/fixed_code_pass_all_test_reward/mean": 0.6590908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.21041364967823029, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.23390518354547132, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.148740918841213, "learning_rate": 1.9982834532780414e-05, "loss": 0.0059, "num_tokens": 10755722.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.2340896513558384, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.065625402610749, "learning_rate": 1.9982645437783152e-05, "loss": 0.0026, "num_tokens": 10767023.0, "reward": 1.534926414489746, "reward_std": 0.4382210373878479, "rewards/fixed_code_pass_all_test_reward/mean": 0.6599264740943909, "rewards/fixed_code_pass_all_test_reward/std": 0.14609718322753906, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2342741191662055, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04834607429802418, "learning_rate": 1.9982455307851598e-05, "loss": 0.0019, "num_tokens": 10775143.0, "reward": 1.921875, "reward_std": 0.07867428660392761, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.07867428660392761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.2344585869765726, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.05608690553344786, "learning_rate": 1.998226414300546e-05, "loss": 0.0022, "num_tokens": 10784618.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.23464305478693967, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.08392241224646568, "learning_rate": 1.9982071943264557e-05, "loss": 0.0034, "num_tokens": 10790762.0, "reward": 1.2638888359069824, "reward_std": 0.19641852378845215, "rewards/fixed_code_pass_all_test_reward/mean": 0.2638888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.19641855359077454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 363.125, "completions/mean_terminated_length": 363.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.23482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.04167533200234175, "learning_rate": 1.998187870864882e-05, "loss": 0.0017, "num_tokens": 10798339.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.23501199040767387, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.08124719653278589, "learning_rate": 1.998168443917828e-05, "loss": 0.0032, "num_tokens": 10805213.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.23519645821804094, "frac_reward_zero_std": 1.0, "grad_norm": 0.2294921875, "kl": 0.11112395022064447, "learning_rate": 1.9981489134873075e-05, "loss": 0.0044, "num_tokens": 10811764.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.23538092602840804, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05705802049487829, "learning_rate": 1.9981292795753453e-05, "loss": 0.0023, "num_tokens": 10818990.0, "reward": 1.0806450843811035, "reward_std": 0.08621326833963394, "rewards/fixed_code_pass_all_test_reward/mean": 0.08064515888690948, "rewards/fixed_code_pass_all_test_reward/std": 0.08621330559253693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.23556539383877514, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.08685056306421757, "learning_rate": 1.9981095421839772e-05, "loss": 0.0035, "num_tokens": 10830161.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.23574986164914222, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.0722743347287178, "learning_rate": 1.9980897013152493e-05, "loss": 0.0029, "num_tokens": 10838619.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.23593432945950932, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.0664640236645937, "learning_rate": 1.9980697569712187e-05, "loss": 0.0027, "num_tokens": 10847803.0, "reward": 1.2770271301269531, "reward_std": 0.14062222838401794, "rewards/fixed_code_pass_all_test_reward/mean": 0.2770270109176636, "rewards/fixed_code_pass_all_test_reward/std": 0.14062219858169556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.23611879726987642, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.07362988218665123, "learning_rate": 1.998049709153953e-05, "loss": 0.0029, "num_tokens": 10856341.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2363032650802435, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.05044866888783872, "learning_rate": 1.9980295578655303e-05, "loss": 0.002, "num_tokens": 10864117.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2364877328906106, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.06563374446704984, "learning_rate": 1.9980093031080403e-05, "loss": 0.0026, "num_tokens": 10868714.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2366722007009777, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.07010183844249696, "learning_rate": 1.9979889448835825e-05, "loss": 0.0028, "num_tokens": 10875778.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.23685666851134476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.0588835421949625, "learning_rate": 1.9979684831942677e-05, "loss": 0.0024, "num_tokens": 10883650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.23704113632171186, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.052842499455437064, "learning_rate": 1.997947918042217e-05, "loss": 0.0021, "num_tokens": 10887483.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.23722560413207897, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.07487613963894546, "learning_rate": 1.997927249429563e-05, "loss": 0.003, "num_tokens": 10893719.0, "reward": 1.4047620296478271, "reward_std": 0.4929039478302002, "rewards/fixed_code_pass_all_test_reward/mean": 0.4047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.4929039776325226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.23741007194244604, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.0552777883131057, "learning_rate": 1.997906477358448e-05, "loss": 0.0022, "num_tokens": 10903427.0, "reward": 1.125, "reward_std": 0.24800796806812286, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.24800795316696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.23759453975281314, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.07676322432234883, "learning_rate": 1.9978856018310253e-05, "loss": 0.0031, "num_tokens": 10911636.0, "reward": 1.0074999332427979, "reward_std": 0.02121318317949772, "rewards/fixed_code_pass_all_test_reward/mean": 0.007499999832361937, "rewards/fixed_code_pass_all_test_reward/std": 0.02121320366859436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 511.375, "completions/mean_terminated_length": 511.375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.2377790075631802, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.03342886897735298, "learning_rate": 1.99786462284946e-05, "loss": 0.0013, "num_tokens": 10921079.0, "reward": 1.4208333492279053, "reward_std": 0.6381862759590149, "rewards/fixed_code_pass_all_test_reward/mean": 0.5458333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.35542842745780945, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2379634753735473, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.09023055341094732, "learning_rate": 1.997843540415926e-05, "loss": 0.0036, "num_tokens": 10930633.0, "reward": 1.9027777910232544, "reward_std": 0.2749859392642975, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2381479431839144, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.0638743401505053, "learning_rate": 1.9978223545326097e-05, "loss": 0.0026, "num_tokens": 10942109.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.23833241099428149, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.0773246050812304, "learning_rate": 1.9978010652017074e-05, "loss": 0.0031, "num_tokens": 10950924.0, "reward": 1.2333333492279053, "reward_std": 0.09428088366985321, "rewards/fixed_code_pass_all_test_reward/mean": 0.23333334922790527, "rewards/fixed_code_pass_all_test_reward/std": 0.0942809134721756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.23851687880464859, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04100760817527771, "learning_rate": 1.997779672425426e-05, "loss": 0.0016, "num_tokens": 10960991.0, "reward": 1.1006944179534912, "reward_std": 0.18659758567810059, "rewards/fixed_code_pass_all_test_reward/mean": 0.1006944477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.18659763038158417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.23870134661501569, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03814176539890468, "learning_rate": 1.9977581762059833e-05, "loss": 0.0015, "num_tokens": 10968364.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.23888581442538276, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.031211954541504383, "learning_rate": 1.9977365765456085e-05, "loss": 0.0012, "num_tokens": 10975145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.23907028223574986, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.09488718491047621, "learning_rate": 1.9977148734465403e-05, "loss": 0.0038, "num_tokens": 10986532.0, "reward": 1.8416666984558105, "reward_std": 0.15911462903022766, "rewards/fixed_code_pass_all_test_reward/mean": 0.8416666388511658, "rewards/fixed_code_pass_all_test_reward/std": 0.15911459922790527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.23925475004611696, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.05824032728560269, "learning_rate": 1.9976930669110292e-05, "loss": 0.0023, "num_tokens": 10995177.0, "reward": 1.0364582538604736, "reward_std": 0.014731382951140404, "rewards/fixed_code_pass_all_test_reward/mean": 0.0364583358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.01473139226436615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.23943921785648403, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.03924765903502703, "learning_rate": 1.9976711569413353e-05, "loss": 0.0016, "num_tokens": 10999179.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 445.25, "completions/mean_terminated_length": 445.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.23962368566685113, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955078125, "kl": 0.021552508231252432, "learning_rate": 1.997649143539731e-05, "loss": 0.0009, "num_tokens": 11008221.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 468.875, "completions/mean_terminated_length": 468.875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.23980815347721823, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.06330589158460498, "learning_rate": 1.9976270267084974e-05, "loss": 0.0025, "num_tokens": 11017596.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2399926212875853, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.06948912097141147, "learning_rate": 1.9976048064499283e-05, "loss": 0.0028, "num_tokens": 11023371.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2401770890979524, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.07741766143590212, "learning_rate": 1.997582482766327e-05, "loss": 0.0031, "num_tokens": 11029352.0, "reward": 1.375, "reward_std": 0.41924625635147095, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.41924628615379333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 441.25, "completions/mean_terminated_length": 441.25, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.2403615569083195, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.06529013020917773, "learning_rate": 1.997560055660008e-05, "loss": 0.0026, "num_tokens": 11042450.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.24054602471868658, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.07117086555808783, "learning_rate": 1.997537525133296e-05, "loss": 0.0028, "num_tokens": 11053252.0, "reward": 1.6666667461395264, "reward_std": 0.46386152505874634, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4638615846633911, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.24073049252905368, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.07933639688417315, "learning_rate": 1.9975148911885275e-05, "loss": 0.0032, "num_tokens": 11065407.0, "reward": 1.8705356121063232, "reward_std": 0.1502397060394287, "rewards/fixed_code_pass_all_test_reward/mean": 0.8705357313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.1502397358417511, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.24091496033942078, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.07196844788268209, "learning_rate": 1.9974921538280485e-05, "loss": 0.0029, "num_tokens": 11076619.0, "reward": 1.5597825050354004, "reward_std": 0.015371894463896751, "rewards/fixed_code_pass_all_test_reward/mean": 0.5597826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.015371883288025856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 312.25, "completions/mean_terminated_length": 312.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.24109942814978785, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04975930065847933, "learning_rate": 1.9974693130542168e-05, "loss": 0.002, "num_tokens": 11084085.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.24128389596015495, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.06995000224560499, "learning_rate": 1.9974463688693994e-05, "loss": 0.0028, "num_tokens": 11094385.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.24146836377052205, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.16098308051005006, "learning_rate": 1.9974233212759758e-05, "loss": 0.0064, "num_tokens": 11103242.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.24165283158088913, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03872451093047857, "learning_rate": 1.9974001702763356e-05, "loss": 0.0015, "num_tokens": 11109930.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.24183729939125623, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.02910477132536471, "learning_rate": 1.997376915872878e-05, "loss": 0.0012, "num_tokens": 11114430.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.24202176720162333, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.053528830176219344, "learning_rate": 1.997353558068015e-05, "loss": 0.0021, "num_tokens": 11120962.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2422062350119904, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.06050563510507345, "learning_rate": 1.9973300968641675e-05, "loss": 0.0024, "num_tokens": 11130142.0, "reward": 1.861328125, "reward_std": 0.20920459926128387, "rewards/fixed_code_pass_all_test_reward/mean": 0.861328125, "rewards/fixed_code_pass_all_test_reward/std": 0.20920461416244507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.2423907028223575, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.021936406672466546, "learning_rate": 1.9973065322637673e-05, "loss": 0.0009, "num_tokens": 11136449.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2425751706327246, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.06247883662581444, "learning_rate": 1.9972828642692587e-05, "loss": 0.0025, "num_tokens": 11141647.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.24275963844309167, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.05816083587706089, "learning_rate": 1.9972590928830945e-05, "loss": 0.0023, "num_tokens": 11146079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.24294410625345877, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04927245411090553, "learning_rate": 1.9972352181077393e-05, "loss": 0.002, "num_tokens": 11150410.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.24312857406382588, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.06258185068145394, "learning_rate": 1.997211239945669e-05, "loss": 0.0025, "num_tokens": 11161355.0, "reward": 1.586538553237915, "reward_std": 0.3653123080730438, "rewards/fixed_code_pass_all_test_reward/mean": 0.5865384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.3653123378753662, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.24331304187419295, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.07633263152092695, "learning_rate": 1.9971871583993684e-05, "loss": 0.0031, "num_tokens": 11170282.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.24349750968456005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.08423224557191133, "learning_rate": 1.9971629734713346e-05, "loss": 0.0034, "num_tokens": 11179057.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.24368197749492715, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.033582302974537015, "learning_rate": 1.9971386851640754e-05, "loss": 0.0013, "num_tokens": 11182897.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.24386644530529422, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.0745147867128253, "learning_rate": 1.997114293480108e-05, "loss": 0.003, "num_tokens": 11188192.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.24405091311566132, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.052544296719133854, "learning_rate": 1.9970897984219614e-05, "loss": 0.0021, "num_tokens": 11192615.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.2442353809260284, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.031214418821036816, "learning_rate": 1.997065199992176e-05, "loss": 0.0012, "num_tokens": 11197049.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2444198487363955, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.03052685991860926, "learning_rate": 1.9970404981933006e-05, "loss": 0.0012, "num_tokens": 11202376.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2446043165467626, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.029898729408159852, "learning_rate": 1.997015693027897e-05, "loss": 0.0012, "num_tokens": 11212338.0, "reward": 1.658046007156372, "reward_std": 0.25195762515068054, "rewards/fixed_code_pass_all_test_reward/mean": 0.6580460071563721, "rewards/fixed_code_pass_all_test_reward/std": 0.25195759534835815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 451.25, "completions/mean_terminated_length": 451.25, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.24478878435712967, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.05232251877896488, "learning_rate": 1.9969907844985366e-05, "loss": 0.0021, "num_tokens": 11224412.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.24497325216749677, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.03530004946514964, "learning_rate": 1.9969657726078017e-05, "loss": 0.0014, "num_tokens": 11232050.0, "reward": 1.9642857313156128, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.24515771997786387, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05684534879401326, "learning_rate": 1.9969406573582857e-05, "loss": 0.0023, "num_tokens": 11236686.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.24534218778823094, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.07199973566457629, "learning_rate": 1.9969154387525918e-05, "loss": 0.0029, "num_tokens": 11243822.0, "reward": 1.8461538553237915, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 351.0, "completions/mean_terminated_length": 351.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.24552665559859804, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.046237635193392634, "learning_rate": 1.996890116793335e-05, "loss": 0.0018, "num_tokens": 11253350.0, "reward": 1.322115421295166, "reward_std": 0.4202892482280731, "rewards/fixed_code_pass_all_test_reward/mean": 0.32211539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.4202892780303955, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.24571112340896514, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.055426849983632565, "learning_rate": 1.9968646914831402e-05, "loss": 0.0022, "num_tokens": 11265458.0, "reward": 1.7842261791229248, "reward_std": 0.03788074478507042, "rewards/fixed_code_pass_all_test_reward/mean": 0.7842261791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.03788072615861893, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.24589559121933222, "frac_reward_zero_std": 1.0, "grad_norm": 0.251953125, "kl": 0.06481029884889722, "learning_rate": 1.9968391628246436e-05, "loss": 0.0026, "num_tokens": 11272888.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.24608005902969932, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.08548794966191053, "learning_rate": 1.9968135308204917e-05, "loss": 0.0034, "num_tokens": 11281721.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.24626452684006642, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.033831899520009756, "learning_rate": 1.9967877954733413e-05, "loss": 0.0014, "num_tokens": 11290178.0, "reward": 1.9291666746139526, "reward_std": 0.04520680010318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.9291666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.045206762850284576, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2464489946504335, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.04918299335986376, "learning_rate": 1.9967619567858617e-05, "loss": 0.002, "num_tokens": 11294183.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2466334624608006, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.025479455944150686, "learning_rate": 1.9967360147607307e-05, "loss": 0.001, "num_tokens": 11301317.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2468179302711677, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.04540640325285494, "learning_rate": 1.9967099694006384e-05, "loss": 0.0018, "num_tokens": 11305943.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.24700239808153476, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.05060215573757887, "learning_rate": 1.9966838207082843e-05, "loss": 0.002, "num_tokens": 11310175.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.24718686589190186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.06601592618972063, "learning_rate": 1.99665756868638e-05, "loss": 0.0026, "num_tokens": 11321286.0, "reward": 1.0543477535247803, "reward_std": 0.045004263520240784, "rewards/fixed_code_pass_all_test_reward/mean": 0.05434782803058624, "rewards/fixed_code_pass_all_test_reward/std": 0.04500427842140198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.24737133370226896, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.09385432559065521, "learning_rate": 1.9966312133376466e-05, "loss": 0.0038, "num_tokens": 11325274.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.24755580151263604, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.053817325038835406, "learning_rate": 1.996604754664817e-05, "loss": 0.0022, "num_tokens": 11329448.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.24774026932300314, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.05339163402095437, "learning_rate": 1.996578192670634e-05, "loss": 0.0021, "num_tokens": 11337467.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.24792473713337024, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.08137089665979147, "learning_rate": 1.996551527357851e-05, "loss": 0.0033, "num_tokens": 11346537.0, "reward": 1.54347825050354, "reward_std": 0.18446266651153564, "rewards/fixed_code_pass_all_test_reward/mean": 0.54347825050354, "rewards/fixed_code_pass_all_test_reward/std": 0.18446263670921326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2481092049437373, "frac_reward_zero_std": 1.0, "grad_norm": 0.255859375, "kl": 0.058191659627482295, "learning_rate": 1.996524758729233e-05, "loss": 0.0023, "num_tokens": 11352279.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2482936727541044, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.1121706934645772, "learning_rate": 1.996497886787555e-05, "loss": 0.0045, "num_tokens": 11361615.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 378.125, "completions/mean_terminated_length": 378.125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.2484781405644715, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.048165290616452694, "learning_rate": 1.996470911535603e-05, "loss": 0.0019, "num_tokens": 11369528.0, "reward": 1.7761627435684204, "reward_std": 0.08958551287651062, "rewards/fixed_code_pass_all_test_reward/mean": 0.7761628031730652, "rewards/fixed_code_pass_all_test_reward/std": 0.0895855501294136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 326.625, "completions/mean_terminated_length": 326.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.24866260837483858, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.05890075536444783, "learning_rate": 1.9964438329761736e-05, "loss": 0.0024, "num_tokens": 11379549.0, "reward": 1.9509494304656982, "reward_std": 0.13873612880706787, "rewards/fixed_code_pass_all_test_reward/mean": 0.9509493708610535, "rewards/fixed_code_pass_all_test_reward/std": 0.13873615860939026, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.24884707618520568, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.0925910216756165, "learning_rate": 1.9964166511120736e-05, "loss": 0.0037, "num_tokens": 11387131.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.24903154399557278, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.06029292428866029, "learning_rate": 1.9963893659461223e-05, "loss": 0.0024, "num_tokens": 11396186.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.24921601180593986, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.04121057072188705, "learning_rate": 1.9963619774811467e-05, "loss": 0.0016, "num_tokens": 11399993.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.24940047961630696, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0696862330660224, "learning_rate": 1.996334485719988e-05, "loss": 0.0028, "num_tokens": 11408063.0, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 582.75, "completions/mean_terminated_length": 582.75, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.24958494742667406, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.041105630807578564, "learning_rate": 1.996306890665495e-05, "loss": 0.0016, "num_tokens": 11425253.0, "reward": 1.1190476417541504, "reward_std": 0.33671751618385315, "rewards/fixed_code_pass_all_test_reward/mean": 0.1190476194024086, "rewards/fixed_code_pass_all_test_reward/std": 0.33671751618385315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.24976941523704113, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.045710903126746416, "learning_rate": 1.9962791923205296e-05, "loss": 0.0018, "num_tokens": 11433806.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.24995388304740823, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.07209481485188007, "learning_rate": 1.9962513906879626e-05, "loss": 0.0029, "num_tokens": 11445056.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.2501383508577753, "frac_reward_zero_std": 1.0, "grad_norm": 0.053955078125, "kl": 0.03035471262410283, "learning_rate": 1.9962234857706768e-05, "loss": 0.0012, "num_tokens": 11454501.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.25032281866814243, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.045521254651248455, "learning_rate": 1.996195477571565e-05, "loss": 0.0018, "num_tokens": 11467267.0, "reward": 1.5787036418914795, "reward_std": 0.4952993094921112, "rewards/fixed_code_pass_all_test_reward/mean": 0.5787037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.4952993094921112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2505072864785095, "frac_reward_zero_std": 1.0, "grad_norm": 1.046875, "kl": 0.10953079350292683, "learning_rate": 1.9961673660935304e-05, "loss": 0.0044, "num_tokens": 11474400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.2506917542888766, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.045237138867378235, "learning_rate": 1.9961391513394886e-05, "loss": 0.0018, "num_tokens": 11479340.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2508762220992437, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.04525521257892251, "learning_rate": 1.9961108333123634e-05, "loss": 0.0018, "num_tokens": 11488115.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2510606899096108, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.027870766120031476, "learning_rate": 1.9960824120150918e-05, "loss": 0.0011, "num_tokens": 11492923.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 487.375, "completions/mean_terminated_length": 487.375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.25124515771997785, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.038976195035502315, "learning_rate": 1.9960538874506194e-05, "loss": 0.0016, "num_tokens": 11502254.0, "reward": 0.44999998807907104, "reward_std": 0.8332380652427673, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.37032803893089294, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.251429625530345, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.08797367615625262, "learning_rate": 1.9960252596219042e-05, "loss": 0.0035, "num_tokens": 11510997.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.25161409334071205, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05915744509547949, "learning_rate": 1.9959965285319135e-05, "loss": 0.0024, "num_tokens": 11520493.0, "reward": 1.3445945978164673, "reward_std": 0.26745641231536865, "rewards/fixed_code_pass_all_test_reward/mean": 0.3445945680141449, "rewards/fixed_code_pass_all_test_reward/std": 0.26745641231536865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 481.0, "completions/mean_terminated_length": 257.14288330078125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2517985611510791, "frac_reward_zero_std": 0.0, "grad_norm": 0.62890625, "kl": 0.06046892097219825, "learning_rate": 1.9959676941836262e-05, "loss": 0.0024, "num_tokens": 11527885.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.25198302896144625, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.04656099248677492, "learning_rate": 1.9959387565800314e-05, "loss": 0.0019, "num_tokens": 11536924.0, "reward": 1.90625, "reward_std": 0.1735912710428238, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2521674967718133, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.050233818124979734, "learning_rate": 1.99590971572413e-05, "loss": 0.002, "num_tokens": 11540979.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 514.375, "completions/mean_terminated_length": 514.375, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.2523519645821804, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.03871607221662998, "learning_rate": 1.995880571618932e-05, "loss": 0.0015, "num_tokens": 11552446.0, "reward": 1.1184210777282715, "reward_std": 0.07309107482433319, "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.073091059923172, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2525364323925475, "frac_reward_zero_std": 1.0, "grad_norm": 0.392578125, "kl": 0.060727344709448516, "learning_rate": 1.9958513242674588e-05, "loss": 0.0024, "num_tokens": 11556667.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2527209002029146, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.06769494875334203, "learning_rate": 1.9958219736727428e-05, "loss": 0.0027, "num_tokens": 11563136.0, "reward": 1.7037036418914795, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7037037014961243, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2529053680132817, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.06499261315912008, "learning_rate": 1.9957925198378273e-05, "loss": 0.0026, "num_tokens": 11572094.0, "reward": 1.101694941520691, "reward_std": 0.04707556217908859, "rewards/fixed_code_pass_all_test_reward/mean": 0.10169491171836853, "rewards/fixed_code_pass_all_test_reward/std": 0.04707559570670128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.25308983582364875, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.059196391608566046, "learning_rate": 1.9957629627657654e-05, "loss": 0.0024, "num_tokens": 11582181.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2532743036340159, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.046248845756053925, "learning_rate": 1.9957333024596214e-05, "loss": 0.0018, "num_tokens": 11588915.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.25345877144438295, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.04817272152286023, "learning_rate": 1.9957035389224702e-05, "loss": 0.0019, "num_tokens": 11592844.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.25364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.054975228384137154, "learning_rate": 1.9956736721573974e-05, "loss": 0.0022, "num_tokens": 11598226.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.25382770706511715, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.07083105435594916, "learning_rate": 1.9956437021675003e-05, "loss": 0.0028, "num_tokens": 11605907.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.2540121748754842, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.031956019112840295, "learning_rate": 1.995613628955885e-05, "loss": 0.0013, "num_tokens": 11611577.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 601.75, "completions/mean_terminated_length": 601.75, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.2541966426858513, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.019758016103878617, "learning_rate": 1.9955834525256694e-05, "loss": 0.0008, "num_tokens": 11623031.0, "reward": 1.8256173133850098, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2543811104962184, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.06361814518459141, "learning_rate": 1.9955531728799823e-05, "loss": 0.0025, "num_tokens": 11627723.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.09679154166951776, "learning_rate": 1.9955227900219625e-05, "loss": 0.0039, "num_tokens": 11635456.0, "reward": 1.34375, "reward_std": 0.1388959437608719, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.13889597356319427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 437.875, "completions/mean_terminated_length": 437.875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.25475004611695257, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.03198201581835747, "learning_rate": 1.9954923039547606e-05, "loss": 0.0013, "num_tokens": 11648359.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2549345139273197, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07485616812482476, "learning_rate": 1.9954617146815364e-05, "loss": 0.003, "num_tokens": 11653166.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.25511898173768677, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06592679396271706, "learning_rate": 1.995431022205462e-05, "loss": 0.0026, "num_tokens": 11659560.0, "reward": 1.2887930870056152, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.4137931168079376, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 351.625, "completions/mean_terminated_length": 351.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.25530344954805384, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.08018231624737382, "learning_rate": 1.9954002265297188e-05, "loss": 0.0032, "num_tokens": 11670061.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.25548791735842097, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.044821570510976017, "learning_rate": 1.9953693276574993e-05, "loss": 0.0018, "num_tokens": 11679367.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.25567238516878804, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07324380381032825, "learning_rate": 1.9953383255920076e-05, "loss": 0.0029, "num_tokens": 11688893.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 296.75, "completions/mean_terminated_length": 296.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2558568529791551, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.053011367097496986, "learning_rate": 1.995307220336457e-05, "loss": 0.0021, "num_tokens": 11694859.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.25604132078952224, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.025649540941230953, "learning_rate": 1.995276011894073e-05, "loss": 0.001, "num_tokens": 11702036.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 490.25, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.2562257885998893, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.06322388770058751, "learning_rate": 1.9952447002680908e-05, "loss": 0.0025, "num_tokens": 11712030.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.2564102564102564, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.09371439972892404, "learning_rate": 1.9952132854617567e-05, "loss": 0.0037, "num_tokens": 11719100.0, "reward": 1.7142857313156128, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2565947242206235, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0839359606616199, "learning_rate": 1.9951817674783272e-05, "loss": 0.0034, "num_tokens": 11725843.0, "reward": 1.7946428060531616, "reward_std": 0.19631260633468628, "rewards/fixed_code_pass_all_test_reward/mean": 0.7946428060531616, "rewards/fixed_code_pass_all_test_reward/std": 0.1963125616312027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2567791920309906, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.06468530022539198, "learning_rate": 1.9951501463210704e-05, "loss": 0.0026, "num_tokens": 11730385.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.25696365984135766, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.07508175959810615, "learning_rate": 1.995118421993264e-05, "loss": 0.003, "num_tokens": 11736345.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2571481276517248, "frac_reward_zero_std": 1.0, "grad_norm": 0.84765625, "kl": 0.14588174037635326, "learning_rate": 1.995086594498197e-05, "loss": 0.0058, "num_tokens": 11743943.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 310.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.25733259546209186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.054404240334406495, "learning_rate": 1.99505466383917e-05, "loss": 0.0022, "num_tokens": 11753287.0, "reward": 1.2544642686843872, "reward_std": 0.4078482389450073, "rewards/fixed_code_pass_all_test_reward/mean": 0.2544642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4078482687473297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.25751706327245893, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0746133872307837, "learning_rate": 1.9950226300194923e-05, "loss": 0.003, "num_tokens": 11759171.0, "reward": 1.5336538553237915, "reward_std": 0.488427996635437, "rewards/fixed_code_pass_all_test_reward/mean": 0.5336538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.488427996635437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.25770153108282606, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06124119693413377, "learning_rate": 1.9949904930424857e-05, "loss": 0.0025, "num_tokens": 11766997.0, "reward": 1.9598214626312256, "reward_std": 0.024798767641186714, "rewards/fixed_code_pass_all_test_reward/mean": 0.9598214626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.024798741564154625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.25788599889319314, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.05618661781772971, "learning_rate": 1.994958252911481e-05, "loss": 0.0022, "num_tokens": 11771621.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.2580704667035602, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.05733153177425265, "learning_rate": 1.9949259096298217e-05, "loss": 0.0023, "num_tokens": 11777750.0, "reward": 1.7777777910232544, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.25825493451392734, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.04668461834080517, "learning_rate": 1.99489346320086e-05, "loss": 0.0019, "num_tokens": 11782801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2584394023242944, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05440469807945192, "learning_rate": 1.9948609136279607e-05, "loss": 0.0022, "num_tokens": 11790048.0, "reward": 1.6022727489471436, "reward_std": 0.32934948801994324, "rewards/fixed_code_pass_all_test_reward/mean": 0.6022727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.32934945821762085, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2586238701346615, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.05725490069016814, "learning_rate": 1.9948282609144975e-05, "loss": 0.0023, "num_tokens": 11800759.0, "reward": 1.98369562625885, "reward_std": 0.046115659177303314, "rewards/fixed_code_pass_all_test_reward/mean": 0.9836956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.04611567035317421, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 241.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2588083379450286, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.11624108068645, "learning_rate": 1.994795505063856e-05, "loss": 0.0047, "num_tokens": 11809727.0, "reward": 1.1458333730697632, "reward_std": 0.06076743081212044, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.060767434537410736, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2589928057553957, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.11473184637725353, "learning_rate": 1.994762646079432e-05, "loss": 0.0046, "num_tokens": 11818488.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.25917727356576276, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.06093479925766587, "learning_rate": 1.9947296839646322e-05, "loss": 0.0024, "num_tokens": 11826527.0, "reward": 1.942307710647583, "reward_std": 0.06818503141403198, "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.06818501651287079, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2593617413761299, "frac_reward_zero_std": 1.0, "grad_norm": 0.34765625, "kl": 0.0812570583075285, "learning_rate": 1.9946966187228736e-05, "loss": 0.0033, "num_tokens": 11830717.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.25954620918649696, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.14377528335899115, "learning_rate": 1.994663450357585e-05, "loss": 0.0058, "num_tokens": 11839911.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.25973067699686403, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03374645160511136, "learning_rate": 1.994630178872204e-05, "loss": 0.0013, "num_tokens": 11844688.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 380.125, "completions/mean_terminated_length": 380.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.25991514480723116, "frac_reward_zero_std": 1.0, "grad_norm": 0.7109375, "kl": 0.16678453609347343, "learning_rate": 1.9945968042701805e-05, "loss": 0.0067, "num_tokens": 11854769.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.26009961261759823, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.03882792126387358, "learning_rate": 1.9945633265549743e-05, "loss": 0.0016, "num_tokens": 11860119.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.2602840804279653, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.030256236786954105, "learning_rate": 1.9945297457300568e-05, "loss": 0.0012, "num_tokens": 11866196.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.26046854823833243, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.04119424242526293, "learning_rate": 1.994496061798909e-05, "loss": 0.0016, "num_tokens": 11871256.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2606530160486995, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.04438162315636873, "learning_rate": 1.9944622747650225e-05, "loss": 0.0018, "num_tokens": 11875725.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2608374838590666, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.05461907386779785, "learning_rate": 1.9944283846319012e-05, "loss": 0.0022, "num_tokens": 11885060.0, "reward": 1.4090908765792847, "reward_std": 0.7256475687026978, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.49896588921546936, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.2610219516694337, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.09730295138433576, "learning_rate": 1.994394391403058e-05, "loss": 0.0039, "num_tokens": 11891956.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2612064194798008, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.10154067352414131, "learning_rate": 1.9943602950820167e-05, "loss": 0.0041, "num_tokens": 11900364.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 450.75, "completions/mean_terminated_length": 450.75, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.26139088729016785, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.05503968754783273, "learning_rate": 1.994326095672313e-05, "loss": 0.0022, "num_tokens": 11909218.0, "reward": 1.451923131942749, "reward_std": 0.19905738532543182, "rewards/fixed_code_pass_all_test_reward/mean": 0.45192307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.19905738532543182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 332.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.261575355100535, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.03780889441259205, "learning_rate": 1.994291793177492e-05, "loss": 0.0015, "num_tokens": 11915950.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.26175982291090205, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06218882976099849, "learning_rate": 1.99425738760111e-05, "loss": 0.0025, "num_tokens": 11921626.0, "reward": 1.4583333730697632, "reward_std": 0.7130230665206909, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4655483365058899, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.2619442907212691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.05333886807784438, "learning_rate": 1.9942228789467338e-05, "loss": 0.0021, "num_tokens": 11931477.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.26212875853163625, "frac_reward_zero_std": 1.0, "grad_norm": 0.71875, "kl": 0.18561644107103348, "learning_rate": 1.9941882672179415e-05, "loss": 0.0074, "num_tokens": 11939191.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2623132263420033, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.08270745910704136, "learning_rate": 1.994153552418321e-05, "loss": 0.0033, "num_tokens": 11943569.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2624976941523704, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.05387852247804403, "learning_rate": 1.9941187345514716e-05, "loss": 0.0022, "num_tokens": 11947911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2626821619627375, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.07131146313622594, "learning_rate": 1.9940838136210024e-05, "loss": 0.0029, "num_tokens": 11958677.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.2628666297731046, "frac_reward_zero_std": 1.0, "grad_norm": 0.11572265625, "kl": 0.11212560068815947, "learning_rate": 1.9940487896305346e-05, "loss": 0.0045, "num_tokens": 11964618.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.26305109758347167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.028809632174670696, "learning_rate": 1.9940136625836986e-05, "loss": 0.0012, "num_tokens": 11970329.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2632355653938388, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.05921385670080781, "learning_rate": 1.9939784324841365e-05, "loss": 0.0024, "num_tokens": 11978872.0, "reward": 1.8486111164093018, "reward_std": 0.20820359885692596, "rewards/fixed_code_pass_all_test_reward/mean": 0.8486111164093018, "rewards/fixed_code_pass_all_test_reward/std": 0.20820365846157074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2634200332042059, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.0680231936275959, "learning_rate": 1.9939430993355005e-05, "loss": 0.0027, "num_tokens": 11987145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.26360450101457295, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.061316484585404396, "learning_rate": 1.993907663141454e-05, "loss": 0.0025, "num_tokens": 11994513.0, "reward": 1.4955357313156128, "reward_std": 0.2972404360771179, "rewards/fixed_code_pass_all_test_reward/mean": 0.4955357015132904, "rewards/fixed_code_pass_all_test_reward/std": 0.2972404360771179, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.2637889688249401, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03558508423157036, "learning_rate": 1.9938721239056703e-05, "loss": 0.0014, "num_tokens": 12003370.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.26397343663530715, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.04458526871167123, "learning_rate": 1.993836481631834e-05, "loss": 0.0018, "num_tokens": 12009634.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2641579044456742, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.053767523262649775, "learning_rate": 1.9938007363236405e-05, "loss": 0.0022, "num_tokens": 12020829.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.26434237225604135, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.047831503208726645, "learning_rate": 1.993764887984796e-05, "loss": 0.0019, "num_tokens": 12027251.0, "reward": 1.7840908765792847, "reward_std": 0.40637627243995667, "rewards/fixed_code_pass_all_test_reward/mean": 0.7840908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.40637627243995667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2645268400664084, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.08048037346452475, "learning_rate": 1.993728936619016e-05, "loss": 0.0032, "num_tokens": 12034146.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2647113078767755, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.05898463330231607, "learning_rate": 1.9936928822300285e-05, "loss": 0.0024, "num_tokens": 12041662.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.2648957756871426, "frac_reward_zero_std": 1.0, "grad_norm": 0.392578125, "kl": 0.12772587314248085, "learning_rate": 1.9936567248215715e-05, "loss": 0.0051, "num_tokens": 12050557.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 296.375, "completions/mean_terminated_length": 296.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2650802434975097, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.04071061499416828, "learning_rate": 1.9936204643973927e-05, "loss": 0.0016, "num_tokens": 12057152.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 596.0, "completions/mean_terminated_length": 388.5714416503906, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.26526471130787677, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.06155144365038723, "learning_rate": 1.993584100961252e-05, "loss": 0.0025, "num_tokens": 12066696.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 445.625, "completions/mean_terminated_length": 445.625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.26544917911824384, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.04479340324178338, "learning_rate": 1.9935476345169192e-05, "loss": 0.0018, "num_tokens": 12079637.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.26563364692861097, "frac_reward_zero_std": 1.0, "grad_norm": 0.03564453125, "kl": 0.02971238805912435, "learning_rate": 1.9935110650681747e-05, "loss": 0.0012, "num_tokens": 12086665.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.26581811473897804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.0264615376945585, "learning_rate": 1.99347439261881e-05, "loss": 0.0011, "num_tokens": 12092524.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.2660025825493451, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.04142060875892639, "learning_rate": 1.993437617172627e-05, "loss": 0.0017, "num_tokens": 12100259.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.26618705035971224, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.09183123568072915, "learning_rate": 1.9934007387334386e-05, "loss": 0.0037, "num_tokens": 12104756.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2663715181700793, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.08403565781190991, "learning_rate": 1.9933637573050677e-05, "loss": 0.0034, "num_tokens": 12108791.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2665559859804464, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.0812339624390006, "learning_rate": 1.9933266728913482e-05, "loss": 0.0032, "num_tokens": 12113820.0, "reward": 1.4943182468414307, "reward_std": 0.21102647483348846, "rewards/fixed_code_pass_all_test_reward/mean": 0.4943181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.21102647483348846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 439.875, "completions/mean_terminated_length": 439.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.2667404537908135, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.06180181237868965, "learning_rate": 1.993289485496125e-05, "loss": 0.0025, "num_tokens": 12122643.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.2669249216011806, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.033306349301710725, "learning_rate": 1.993252195123254e-05, "loss": 0.0013, "num_tokens": 12129276.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.26710938941154766, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.038190876599401236, "learning_rate": 1.9932148017766e-05, "loss": 0.0015, "num_tokens": 12138504.0, "reward": 1.5855262279510498, "reward_std": 0.16747263073921204, "rewards/fixed_code_pass_all_test_reward/mean": 0.5855263471603394, "rewards/fixed_code_pass_all_test_reward/std": 0.16747266054153442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2672938572219148, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.1057237982749939, "learning_rate": 1.9931773054600402e-05, "loss": 0.0042, "num_tokens": 12146317.0, "reward": 1.8236607313156128, "reward_std": 0.056821055710315704, "rewards/fixed_code_pass_all_test_reward/mean": 0.8236607313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.05682109668850899, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.26747832503228186, "frac_reward_zero_std": 1.0, "grad_norm": 0.31640625, "kl": 0.1133955866098404, "learning_rate": 1.9931397061774627e-05, "loss": 0.0045, "num_tokens": 12154412.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.26766279284264893, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.06177677889354527, "learning_rate": 1.9931020039327646e-05, "loss": 0.0025, "num_tokens": 12163417.0, "reward": 1.734375, "reward_std": 0.2868976593017578, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.2868976593017578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 641.625, "completions/mean_terminated_length": 641.625, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.26784726065301606, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.036903930245898664, "learning_rate": 1.9930641987298555e-05, "loss": 0.0015, "num_tokens": 12177750.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.26803172846338313, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.04377555625978857, "learning_rate": 1.9930262905726537e-05, "loss": 0.0018, "num_tokens": 12183606.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 612.125, "completions/mean_terminated_length": 612.125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.2682161962737502, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.022061991039663553, "learning_rate": 1.99298827946509e-05, "loss": 0.0009, "num_tokens": 12195335.0, "reward": 1.3125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.26840066408411734, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.10417079832404852, "learning_rate": 1.992950165411105e-05, "loss": 0.0042, "num_tokens": 12203611.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.2685851318944844, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.06013980298303068, "learning_rate": 1.99291194841465e-05, "loss": 0.0024, "num_tokens": 12210569.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 455.0, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.2687695997048515, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.020456291851587594, "learning_rate": 1.992873628479687e-05, "loss": 0.0008, "num_tokens": 12219337.0, "reward": 1.4821429252624512, "reward_std": 0.27465853095054626, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.27465856075286865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.2689540675152186, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.05422973190434277, "learning_rate": 1.992835205610189e-05, "loss": 0.0022, "num_tokens": 12223830.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2691385353255857, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.08057411620393395, "learning_rate": 1.9927966798101395e-05, "loss": 0.0032, "num_tokens": 12228023.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.26932300313595275, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.03069179120939225, "learning_rate": 1.9927580510835326e-05, "loss": 0.0012, "num_tokens": 12232448.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2695074709463199, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.04510441329330206, "learning_rate": 1.9927193194343726e-05, "loss": 0.0018, "num_tokens": 12238798.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.26969193875668696, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.04545811784919351, "learning_rate": 1.9926804848666753e-05, "loss": 0.0018, "num_tokens": 12248806.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.26987640656705403, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.08558601979166269, "learning_rate": 1.992641547384467e-05, "loss": 0.0034, "num_tokens": 12258283.0, "reward": 1.990625023841858, "reward_std": 0.02651650831103325, "rewards/fixed_code_pass_all_test_reward/mean": 0.9906250238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.026516500860452652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.27006087437742116, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05573216360062361, "learning_rate": 1.992602506991784e-05, "loss": 0.0022, "num_tokens": 12265584.0, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.27024534218778823, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.06816479470580816, "learning_rate": 1.992563363692674e-05, "loss": 0.0027, "num_tokens": 12272964.0, "reward": 1.64130437374115, "reward_std": 0.3421509265899658, "rewards/fixed_code_pass_all_test_reward/mean": 0.6413043141365051, "rewards/fixed_code_pass_all_test_reward/std": 0.34215089678764343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 419.375, "completions/mean_terminated_length": 419.375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.2704298099981553, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.017630112823098898, "learning_rate": 1.9925241174911957e-05, "loss": 0.0007, "num_tokens": 12280991.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.27061427780852243, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.09724992886185646, "learning_rate": 1.9924847683914166e-05, "loss": 0.0039, "num_tokens": 12288890.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2707987456188895, "frac_reward_zero_std": 1.0, "grad_norm": 0.232421875, "kl": 0.0600176730658859, "learning_rate": 1.9924453163974168e-05, "loss": 0.0024, "num_tokens": 12292700.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2709832134292566, "frac_reward_zero_std": 0.0, "grad_norm": 4.875, "kl": 0.08349588373675942, "learning_rate": 1.992405761513287e-05, "loss": 0.0033, "num_tokens": 12298024.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2711676812396237, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.08198619168251753, "learning_rate": 1.992366103743127e-05, "loss": 0.0033, "num_tokens": 12305920.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2713521490499908, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.04752695420756936, "learning_rate": 1.992326343091049e-05, "loss": 0.0019, "num_tokens": 12315452.0, "reward": 1.75, "reward_std": 0.3450327515602112, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.27153661686035785, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.032079927856102586, "learning_rate": 1.9922864795611746e-05, "loss": 0.0013, "num_tokens": 12327045.0, "reward": 1.543269157409668, "reward_std": 0.20397306978702545, "rewards/fixed_code_pass_all_test_reward/mean": 0.5432692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.20397311449050903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.271721084670725, "frac_reward_zero_std": 1.0, "grad_norm": 1.0390625, "kl": 0.12552731623873115, "learning_rate": 1.9922465131576372e-05, "loss": 0.005, "num_tokens": 12333333.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.27190555248109205, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.04311150568537414, "learning_rate": 1.9922064438845793e-05, "loss": 0.0017, "num_tokens": 12337620.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2720900202914591, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662109375, "kl": 0.017899402882903814, "learning_rate": 1.992166271746156e-05, "loss": 0.0007, "num_tokens": 12346079.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.27227448810182625, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.032958275405690074, "learning_rate": 1.9921259967465318e-05, "loss": 0.0013, "num_tokens": 12351572.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.2724589559121933, "frac_reward_zero_std": 1.0, "grad_norm": 0.1767578125, "kl": 0.06398858223110437, "learning_rate": 1.9920856188898817e-05, "loss": 0.0026, "num_tokens": 12355498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.2726434237225604, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.036508833640255034, "learning_rate": 1.9920451381803922e-05, "loss": 0.0015, "num_tokens": 12366130.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2728278915329275, "frac_reward_zero_std": 0.0, "grad_norm": 5.21875, "kl": 0.040717258816584945, "learning_rate": 1.9920045546222598e-05, "loss": 0.0016, "num_tokens": 12374760.0, "reward": 1.3977272510528564, "reward_std": 0.8637218475341797, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.4019947648048401, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.2730123593432946, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04609573679044843, "learning_rate": 1.9919638682196926e-05, "loss": 0.0018, "num_tokens": 12380570.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.27319682715366167, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.0416278587654233, "learning_rate": 1.9919230789769078e-05, "loss": 0.0017, "num_tokens": 12389980.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2733812949640288, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.050751710776239634, "learning_rate": 1.9918821868981347e-05, "loss": 0.002, "num_tokens": 12394122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.27356576277439587, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.07614199840463698, "learning_rate": 1.9918411919876126e-05, "loss": 0.003, "num_tokens": 12401571.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 420.75, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.27375023058476294, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.11692187888547778, "learning_rate": 1.9918000942495913e-05, "loss": 0.0047, "num_tokens": 12410729.0, "reward": 1.7149999141693115, "reward_std": 0.4087262749671936, "rewards/fixed_code_pass_all_test_reward/mean": 0.7149999737739563, "rewards/fixed_code_pass_all_test_reward/std": 0.4087262451648712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.27393469839513007, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.052711405558511615, "learning_rate": 1.9917588936883323e-05, "loss": 0.0021, "num_tokens": 12418948.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.27411916620549714, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.055742616998031735, "learning_rate": 1.991717590308106e-05, "loss": 0.0022, "num_tokens": 12423949.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2743036340158642, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.03949356428347528, "learning_rate": 1.9916761841131952e-05, "loss": 0.0016, "num_tokens": 12433415.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.27448810182623135, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.06568608293309808, "learning_rate": 1.9916346751078924e-05, "loss": 0.0026, "num_tokens": 12441234.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.2746725696365984, "frac_reward_zero_std": 1.0, "grad_norm": 0.4765625, "kl": 0.12739248387515545, "learning_rate": 1.991593063296501e-05, "loss": 0.0051, "num_tokens": 12450870.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.2748570374469655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.0613123734947294, "learning_rate": 1.991551348683335e-05, "loss": 0.0025, "num_tokens": 12456810.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2750415052573326, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.0737862482201308, "learning_rate": 1.991509531272719e-05, "loss": 0.003, "num_tokens": 12462057.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2752259730676997, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.055398568976670504, "learning_rate": 1.9914676110689888e-05, "loss": 0.0022, "num_tokens": 12470612.0, "reward": 1.6153846979141235, "reward_std": 0.26004746556282043, "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.26004746556282043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 282.0, "completions/mean_terminated_length": 282.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.27541044087806676, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.059104107320308685, "learning_rate": 1.99142558807649e-05, "loss": 0.0024, "num_tokens": 12477164.0, "reward": 1.8461538553237915, "reward_std": 0.28486770391464233, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.28486770391464233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2755949086884339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.048838973976671696, "learning_rate": 1.9913834622995787e-05, "loss": 0.002, "num_tokens": 12484795.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.27577937649880097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.04624909209087491, "learning_rate": 1.9913412337426235e-05, "loss": 0.0018, "num_tokens": 12489499.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.27596384430916804, "frac_reward_zero_std": 1.0, "grad_norm": 0.052001953125, "kl": 0.028489691205322742, "learning_rate": 1.9912989024100016e-05, "loss": 0.0011, "num_tokens": 12496433.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 408.125, "completions/mean_terminated_length": 408.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.27614831211953517, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.03229889879003167, "learning_rate": 1.9912564683061014e-05, "loss": 0.0013, "num_tokens": 12504722.0, "reward": 1.1750000715255737, "reward_std": 0.04629099741578102, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.27633277992990224, "frac_reward_zero_std": 1.0, "grad_norm": 4.5625, "kl": 0.4583804849535227, "learning_rate": 1.991213931435323e-05, "loss": 0.0183, "num_tokens": 12512622.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.2765172477402693, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.052784796338528395, "learning_rate": 1.9911712918020756e-05, "loss": 0.0021, "num_tokens": 12517170.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.27670171555063644, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.1051879683509469, "learning_rate": 1.99112854941078e-05, "loss": 0.0042, "num_tokens": 12527653.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2768861833610035, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.044017903972417116, "learning_rate": 1.9910857042658675e-05, "loss": 0.0018, "num_tokens": 12531830.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2770706511713706, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04962373268790543, "learning_rate": 1.9910427563717803e-05, "loss": 0.002, "num_tokens": 12537544.0, "reward": 1.7916667461395264, "reward_std": 0.2480079084634781, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.24800792336463928, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2772551189817377, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.06979273306205869, "learning_rate": 1.9909997057329703e-05, "loss": 0.0028, "num_tokens": 12545959.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2774395867921048, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.0593261937610805, "learning_rate": 1.9909565523539017e-05, "loss": 0.0024, "num_tokens": 12553764.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.27762405460247186, "frac_reward_zero_std": 1.0, "grad_norm": 0.28125, "kl": 0.16169606428593397, "learning_rate": 1.9909132962390472e-05, "loss": 0.0065, "num_tokens": 12564800.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.27780852241283893, "frac_reward_zero_std": 0.0, "grad_norm": 3.921875, "kl": 0.13693272415548563, "learning_rate": 1.990869937392892e-05, "loss": 0.0055, "num_tokens": 12573911.0, "reward": 0.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.27799299022320606, "frac_reward_zero_std": 1.0, "grad_norm": 0.2236328125, "kl": 0.06877797958441079, "learning_rate": 1.990826475819931e-05, "loss": 0.0028, "num_tokens": 12581196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.27817745803357313, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.09001912456005812, "learning_rate": 1.99078291152467e-05, "loss": 0.0036, "num_tokens": 12588807.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2783619258439402, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03797531977761537, "learning_rate": 1.9907392445116258e-05, "loss": 0.0015, "num_tokens": 12598224.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.27854639365430733, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.032905344502069056, "learning_rate": 1.990695474785325e-05, "loss": 0.0013, "num_tokens": 12603452.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2787308614646744, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.047405785182490945, "learning_rate": 1.9906516023503057e-05, "loss": 0.0019, "num_tokens": 12608493.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 421.0, "completions/mean_terminated_length": 421.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.2789153292750415, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06798795820213854, "learning_rate": 1.9906076272111164e-05, "loss": 0.0027, "num_tokens": 12618581.0, "reward": 1.109195351600647, "reward_std": 0.11345508694648743, "rewards/fixed_code_pass_all_test_reward/mean": 0.10919541120529175, "rewards/fixed_code_pass_all_test_reward/std": 0.11345507204532623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 370.25, "completions/mean_terminated_length": 370.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.2790997970854086, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.07963244523853064, "learning_rate": 1.9905635493723156e-05, "loss": 0.0032, "num_tokens": 12625431.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.2792842648957757, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.04991452815011144, "learning_rate": 1.9905193688384735e-05, "loss": 0.002, "num_tokens": 12635172.0, "reward": 1.25, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.27946873270614275, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.033029105281457305, "learning_rate": 1.9904750856141705e-05, "loss": 0.0013, "num_tokens": 12640413.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.2796532005165099, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.044434914947487414, "learning_rate": 1.9904306997039973e-05, "loss": 0.0018, "num_tokens": 12647919.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.27983766832687695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.051305771339684725, "learning_rate": 1.9903862111125556e-05, "loss": 0.0021, "num_tokens": 12652075.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.280022136137244, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.05507510039024055, "learning_rate": 1.9903416198444577e-05, "loss": 0.0022, "num_tokens": 12655896.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.28020660394761115, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.08361389115452766, "learning_rate": 1.9902969259043266e-05, "loss": 0.0033, "num_tokens": 12664859.0, "reward": 1.6728723049163818, "reward_std": 0.46669280529022217, "rewards/fixed_code_pass_all_test_reward/mean": 0.6728723049163818, "rewards/fixed_code_pass_all_test_reward/std": 0.46669283509254456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.2803910717579782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.08538082288578153, "learning_rate": 1.9902521292967956e-05, "loss": 0.0034, "num_tokens": 12673546.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.2805755395683453, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.09065414126962423, "learning_rate": 1.9902072300265093e-05, "loss": 0.0036, "num_tokens": 12683890.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.28076000737871243, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.04352068458683789, "learning_rate": 1.9901622280981225e-05, "loss": 0.0017, "num_tokens": 12688800.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 317.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.2809444751890795, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04294374352321029, "learning_rate": 1.9901171235163006e-05, "loss": 0.0017, "num_tokens": 12695713.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.2811289429994466, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.10341498721390963, "learning_rate": 1.9900719162857195e-05, "loss": 0.0041, "num_tokens": 12706752.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.2813134108098137, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.03259825287386775, "learning_rate": 1.9900266064110664e-05, "loss": 0.0013, "num_tokens": 12713649.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 442.25, "completions/mean_terminated_length": 442.25, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.2814978786201808, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.06894711637869477, "learning_rate": 1.9899811938970383e-05, "loss": 0.0028, "num_tokens": 12722099.0, "reward": 1.1875, "reward_std": 0.752970278263092, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.28168234643054785, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.034998948336578906, "learning_rate": 1.989935678748344e-05, "loss": 0.0014, "num_tokens": 12726919.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.281866814240915, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03867588611319661, "learning_rate": 1.989890060969701e-05, "loss": 0.0015, "num_tokens": 12734230.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.28205128205128205, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.024953163811005652, "learning_rate": 1.98984434056584e-05, "loss": 0.001, "num_tokens": 12739538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2822357498616491, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.05601682420819998, "learning_rate": 1.9897985175414998e-05, "loss": 0.0022, "num_tokens": 12746713.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.28242021767201625, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.01990097528323531, "learning_rate": 1.9897525919014318e-05, "loss": 0.0008, "num_tokens": 12756225.0, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.2826046854823833, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.04775161948055029, "learning_rate": 1.9897065636503973e-05, "loss": 0.0019, "num_tokens": 12760950.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.2827891532927504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.05505303223617375, "learning_rate": 1.9896604327931675e-05, "loss": 0.0022, "num_tokens": 12770234.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.2829736211031175, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.031462120881769806, "learning_rate": 1.9896141993345254e-05, "loss": 0.0013, "num_tokens": 12776021.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 430.875, "completions/mean_terminated_length": 430.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.2831580889134846, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05828576651401818, "learning_rate": 1.9895678632792646e-05, "loss": 0.0023, "num_tokens": 12786076.0, "reward": 1.419471025466919, "reward_std": 0.44844457507133484, "rewards/fixed_code_pass_all_test_reward/mean": 0.5444711446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.45415154099464417, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.28334255672385167, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.01509854116011411, "learning_rate": 1.989521424632188e-05, "loss": 0.0006, "num_tokens": 12792161.0, "reward": 1.7000000476837158, "reward_std": 0.32071349024772644, "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2835270245342188, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06230771308764815, "learning_rate": 1.9894748833981107e-05, "loss": 0.0025, "num_tokens": 12815175.0, "reward": 1.6731927394866943, "reward_std": 0.30871447920799255, "rewards/fixed_code_pass_all_test_reward/mean": 0.6731927394866943, "rewards/fixed_code_pass_all_test_reward/std": 0.30871447920799255, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.28371149234458587, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.04396213870495558, "learning_rate": 1.989428239581858e-05, "loss": 0.0018, "num_tokens": 12819597.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 498.625, "completions/mean_terminated_length": 498.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.28389596015495294, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.04362081678118557, "learning_rate": 1.9893814931882647e-05, "loss": 0.0017, "num_tokens": 12830170.0, "reward": 1.504807710647583, "reward_std": 0.6686668395996094, "rewards/fixed_code_pass_all_test_reward/mean": 0.629807710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.3770548105239868, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.28408042796532007, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.03300483478233218, "learning_rate": 1.989334644222178e-05, "loss": 0.0013, "num_tokens": 12834745.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 488.375, "completions/mean_terminated_length": 488.375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.28426489577568714, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.06385804596357048, "learning_rate": 1.9892876926884544e-05, "loss": 0.0026, "num_tokens": 12848596.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 511.75, "completions/mean_terminated_length": 511.75, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.2844493635860542, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.048254032619297504, "learning_rate": 1.9892406385919618e-05, "loss": 0.0019, "num_tokens": 12863626.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.28463383139642134, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.0624229870736599, "learning_rate": 1.989193481937578e-05, "loss": 0.0025, "num_tokens": 12871020.0, "reward": 1.85326087474823, "reward_std": 0.18583010137081146, "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.18583005666732788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 358.0, "completions/mean_terminated_length": 358.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2848182992067884, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.0365498848259449, "learning_rate": 1.989146222730193e-05, "loss": 0.0015, "num_tokens": 12881876.0, "reward": 1.6328125, "reward_std": 0.7368168234825134, "rewards/fixed_code_pass_all_test_reward/mean": 0.7578125, "rewards/fixed_code_pass_all_test_reward/std": 0.4487551152706146, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 402.0, "completions/mean_terminated_length": 402.0, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.2850027670171555, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.044092579977586865, "learning_rate": 1.989098860974705e-05, "loss": 0.0018, "num_tokens": 12889892.0, "reward": 1.295454502105713, "reward_std": 0.06428244709968567, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2851872348275226, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.03914031758904457, "learning_rate": 1.9890513966760246e-05, "loss": 0.0016, "num_tokens": 12897500.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 480.0, "completions/mean_terminated_length": 480.0, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.2853717026378897, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.03119426465127617, "learning_rate": 1.989003829839073e-05, "loss": 0.0012, "num_tokens": 12908380.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.28555617044825676, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04854785185307264, "learning_rate": 1.9889561604687812e-05, "loss": 0.0019, "num_tokens": 12917580.0, "reward": 1.650240421295166, "reward_std": 0.34072890877723694, "rewards/fixed_code_pass_all_test_reward/mean": 0.650240421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.34072890877723694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2857406382586239, "frac_reward_zero_std": 1.0, "grad_norm": 0.054443359375, "kl": 0.02883279079105705, "learning_rate": 1.9889083885700912e-05, "loss": 0.0012, "num_tokens": 12921745.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.28592510606899096, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.048334666062146425, "learning_rate": 1.9888605141479562e-05, "loss": 0.0019, "num_tokens": 12932194.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 633.5, "completions/mean_terminated_length": 633.5, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.28610957387935804, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.02496644592611119, "learning_rate": 1.988812537207339e-05, "loss": 0.001, "num_tokens": 12951134.0, "reward": 1.8333332538604736, "reward_std": 0.34503278136253357, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.28629404168972516, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.048403201857581735, "learning_rate": 1.9887644577532135e-05, "loss": 0.0019, "num_tokens": 12957676.0, "reward": 1.3421052694320679, "reward_std": 0.3107365071773529, "rewards/fixed_code_pass_all_test_reward/mean": 0.34210526943206787, "rewards/fixed_code_pass_all_test_reward/std": 0.3107365071773529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.28647850950009224, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.051626923494040966, "learning_rate": 1.9887162757905644e-05, "loss": 0.0021, "num_tokens": 12962156.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 632.5, "completions/mean_terminated_length": 632.5, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.2866629773104593, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.032839380437508225, "learning_rate": 1.9886679913243873e-05, "loss": 0.0013, "num_tokens": 12973320.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.28684744512082644, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.05341592035256326, "learning_rate": 1.9886196043596872e-05, "loss": 0.0021, "num_tokens": 12983877.0, "reward": 1.90625, "reward_std": 0.13567514717578888, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.13567513227462769, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 409.625, "completions/mean_terminated_length": 409.625, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.2870319129311935, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.05163940321654081, "learning_rate": 1.9885711149014812e-05, "loss": 0.0021, "num_tokens": 12992330.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2872163807415606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.060778662795200944, "learning_rate": 1.9885225229547957e-05, "loss": 0.0024, "num_tokens": 13001833.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2874008485519277, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.04718358791433275, "learning_rate": 1.9884738285246694e-05, "loss": 0.0019, "num_tokens": 13011285.0, "reward": 1.8365384340286255, "reward_std": 0.3088919222354889, "rewards/fixed_code_pass_all_test_reward/mean": 0.8365384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.3088919222354889, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2875853163622948, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.051910422393120825, "learning_rate": 1.9884250316161494e-05, "loss": 0.0021, "num_tokens": 13023769.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.28776978417266186, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.06802480923943222, "learning_rate": 1.988376132234296e-05, "loss": 0.0027, "num_tokens": 13032137.0, "reward": 1.33984375, "reward_std": 0.2861368656158447, "rewards/fixed_code_pass_all_test_reward/mean": 0.33984375, "rewards/fixed_code_pass_all_test_reward/std": 0.2861368656158447, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.287954251983029, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.036979639902710915, "learning_rate": 1.9883271303841774e-05, "loss": 0.0015, "num_tokens": 13036539.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.28813871979339606, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04171308968216181, "learning_rate": 1.9882780260708746e-05, "loss": 0.0017, "num_tokens": 13040803.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.28832318760376313, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03706836479250342, "learning_rate": 1.988228819299478e-05, "loss": 0.0015, "num_tokens": 13048543.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.28850765541413026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.02440270478837192, "learning_rate": 1.9881795100750896e-05, "loss": 0.001, "num_tokens": 13055014.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 504.75, "completions/mean_terminated_length": 504.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.28869212322449733, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.044012056197971106, "learning_rate": 1.9881300984028213e-05, "loss": 0.0018, "num_tokens": 13068444.0, "reward": 1.115384578704834, "reward_std": 0.5577396750450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.24038462340831757, "rewards/fixed_code_pass_all_test_reward/std": 0.3426254987716675, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2888765910348644, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.05059381015598774, "learning_rate": 1.988080584287795e-05, "loss": 0.002, "num_tokens": 13075460.0, "reward": 1.8611111640930176, "reward_std": 0.19168488681316376, "rewards/fixed_code_pass_all_test_reward/mean": 0.8611111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.19168488681316376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.28906105884523153, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.040906468871980906, "learning_rate": 1.988030967735145e-05, "loss": 0.0016, "num_tokens": 13087800.0, "reward": 1.3185484409332275, "reward_std": 0.17107422649860382, "rewards/fixed_code_pass_all_test_reward/mean": 0.31854838132858276, "rewards/fixed_code_pass_all_test_reward/std": 0.17107422649860382, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2892455266555986, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.08860543975606561, "learning_rate": 1.9879812487500145e-05, "loss": 0.0035, "num_tokens": 13099297.0, "reward": 1.274999976158142, "reward_std": 0.4527692198753357, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.5014265775680542, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2894299944659657, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.054959756787866354, "learning_rate": 1.9879314273375584e-05, "loss": 0.0022, "num_tokens": 13105470.0, "reward": 1.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2896144622763328, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.041789666283875704, "learning_rate": 1.9878815035029418e-05, "loss": 0.0017, "num_tokens": 13109600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.2897989300866999, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04521609912626445, "learning_rate": 1.9878314772513405e-05, "loss": 0.0018, "num_tokens": 13121144.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.28998339789706695, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.05744500714354217, "learning_rate": 1.987781348587941e-05, "loss": 0.0023, "num_tokens": 13128563.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 432.125, "completions/mean_terminated_length": 432.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.290167865707434, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.03745120466919616, "learning_rate": 1.9877311175179397e-05, "loss": 0.0015, "num_tokens": 13139388.0, "reward": 1.5625, "reward_std": 0.4955156147480011, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.29035233351780115, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.09419935662299395, "learning_rate": 1.987680784046545e-05, "loss": 0.0038, "num_tokens": 13146946.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.2905368013281682, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03891092468984425, "learning_rate": 1.9876303481789745e-05, "loss": 0.0016, "num_tokens": 13151855.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.2907212691385353, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.10609187046065927, "learning_rate": 1.9875798099204575e-05, "loss": 0.0042, "num_tokens": 13161428.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2909057369489024, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.0602632244117558, "learning_rate": 1.9875291692762336e-05, "loss": 0.0024, "num_tokens": 13169465.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2910902047592695, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.070282943546772, "learning_rate": 1.9874784262515522e-05, "loss": 0.0028, "num_tokens": 13176810.0, "reward": 1.5875000953674316, "reward_std": 0.3482097089290619, "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.3482097089290619, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 356.5, "completions/mean_terminated_length": 356.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.2912746725696366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.048758917255327106, "learning_rate": 1.9874275808516745e-05, "loss": 0.002, "num_tokens": 13186398.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.2914591403800037, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04576763557270169, "learning_rate": 1.987376633081872e-05, "loss": 0.0018, "num_tokens": 13198649.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2916436081903708, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.06209050607867539, "learning_rate": 1.9873255829474258e-05, "loss": 0.0025, "num_tokens": 13210083.0, "reward": 1.0367647409439087, "reward_std": 0.08545470237731934, "rewards/fixed_code_pass_all_test_reward/mean": 0.036764707416296005, "rewards/fixed_code_pass_all_test_reward/std": 0.08545467257499695, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.29182807600073785, "frac_reward_zero_std": 1.0, "grad_norm": 0.2353515625, "kl": 0.06386336800642312, "learning_rate": 1.9872744304536294e-05, "loss": 0.0026, "num_tokens": 13214406.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.292012543811105, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.09755423618480563, "learning_rate": 1.9872231756057855e-05, "loss": 0.0039, "num_tokens": 13222841.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.29219701162147205, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0826618371065706, "learning_rate": 1.9871718184092078e-05, "loss": 0.0033, "num_tokens": 13231959.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 548.375, "completions/mean_terminated_length": 548.375, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.2923814794318391, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.042786670150235295, "learning_rate": 1.987120358869221e-05, "loss": 0.0017, "num_tokens": 13242650.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.29256594724220625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.05961371725425124, "learning_rate": 1.9870687969911597e-05, "loss": 0.0024, "num_tokens": 13251068.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2927504150525733, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.07194199599325657, "learning_rate": 1.9870171327803694e-05, "loss": 0.0029, "num_tokens": 13260572.0, "reward": 1.5255101919174194, "reward_std": 0.3938828110694885, "rewards/fixed_code_pass_all_test_reward/mean": 0.5255101919174194, "rewards/fixed_code_pass_all_test_reward/std": 0.3938828408718109, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 767.75, "completions/mean_terminated_length": 767.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2929348828629404, "frac_reward_zero_std": 0.0, "grad_norm": 0.54296875, "kl": 0.04116517829243094, "learning_rate": 1.986965366242207e-05, "loss": 0.0016, "num_tokens": 13275442.0, "reward": 1.0, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.2931193506733075, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05867267865687609, "learning_rate": 1.986913497382039e-05, "loss": 0.0023, "num_tokens": 13286532.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.2933038184836746, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.07635619444772601, "learning_rate": 1.986861526205242e-05, "loss": 0.0031, "num_tokens": 13296521.0, "reward": 1.9027777910232544, "reward_std": 0.2749859392642975, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.29348828629404167, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.18401164468377829, "learning_rate": 1.986809452717205e-05, "loss": 0.0074, "num_tokens": 13305239.0, "reward": 1.4728260040283203, "reward_std": 0.33391231298446655, "rewards/fixed_code_pass_all_test_reward/mean": 0.5978260636329651, "rewards/fixed_code_pass_all_test_reward/std": 0.1639210432767868, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2936727541044088, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.2538868556730449, "learning_rate": 1.9867572769233262e-05, "loss": 0.0102, "num_tokens": 13312380.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.29385722191477587, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.08793225674889982, "learning_rate": 1.9867049988290154e-05, "loss": 0.0035, "num_tokens": 13321022.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 341.75, "completions/mean_terminated_length": 341.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.29404168972514294, "frac_reward_zero_std": 1.0, "grad_norm": 0.423828125, "kl": 0.10111156711354852, "learning_rate": 1.9866526184396916e-05, "loss": 0.004, "num_tokens": 13331196.0, "reward": 1.0714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.29422615753551007, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.061437806114554405, "learning_rate": 1.986600135760786e-05, "loss": 0.0025, "num_tokens": 13338418.0, "reward": 1.4800000190734863, "reward_std": 0.440778523683548, "rewards/fixed_code_pass_all_test_reward/mean": 0.47999998927116394, "rewards/fixed_code_pass_all_test_reward/std": 0.440778523683548, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 104.625, "completions/mean_terminated_length": 104.625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.29441062534587714, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.11072788201272488, "learning_rate": 1.986547550797739e-05, "loss": 0.0044, "num_tokens": 13342239.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2945950931562442, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.08745036227628589, "learning_rate": 1.986494863556003e-05, "loss": 0.0035, "num_tokens": 13350088.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.29477956096661134, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.0520627856021747, "learning_rate": 1.9864420740410395e-05, "loss": 0.0021, "num_tokens": 13359913.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2949640287769784, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.02589505910873413, "learning_rate": 1.986389182258322e-05, "loss": 0.001, "num_tokens": 13364973.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2951484965873455, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.11187402484938502, "learning_rate": 1.9863361882133332e-05, "loss": 0.0045, "num_tokens": 13375822.0, "reward": 1.2727272510528564, "reward_std": 0.16833092272281647, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2953329643977126, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06600933731533587, "learning_rate": 1.9862830919115683e-05, "loss": 0.0026, "num_tokens": 13380062.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2955174322080797, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.08473279420286417, "learning_rate": 1.9862298933585307e-05, "loss": 0.0034, "num_tokens": 13386206.0, "reward": 0.8914834856987, "reward_std": 0.787941038608551, "rewards/fixed_code_pass_all_test_reward/mean": 0.26648351550102234, "rewards/fixed_code_pass_all_test_reward/std": 0.35295820236206055, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.29570190001844676, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.07852287101559341, "learning_rate": 1.9861765925597365e-05, "loss": 0.0031, "num_tokens": 13389831.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2958863678288139, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.05218710470944643, "learning_rate": 1.9861231895207116e-05, "loss": 0.0021, "num_tokens": 13394178.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.29607083563918096, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.14273787289857864, "learning_rate": 1.9860696842469923e-05, "loss": 0.0057, "num_tokens": 13399235.0, "reward": 1.6124999523162842, "reward_std": 0.44860896468162537, "rewards/fixed_code_pass_all_test_reward/mean": 0.612500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.44860899448394775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.29625530344954804, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.032891144044697285, "learning_rate": 1.9860160767441253e-05, "loss": 0.0013, "num_tokens": 13407329.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 109.125, "completions/mean_terminated_length": 109.125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.29643977125991516, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.04199417436029762, "learning_rate": 1.9859623670176688e-05, "loss": 0.0017, "num_tokens": 13410930.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.29662423907028224, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.07756194565445185, "learning_rate": 1.9859085550731905e-05, "loss": 0.0031, "num_tokens": 13422893.0, "reward": 1.4362244606018066, "reward_std": 0.32797372341156006, "rewards/fixed_code_pass_all_test_reward/mean": 0.43622449040412903, "rewards/fixed_code_pass_all_test_reward/std": 0.32797375321388245, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 514.375, "completions/mean_terminated_length": 514.375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.2968087068806493, "frac_reward_zero_std": 0.0, "grad_norm": 0.6484375, "kl": 0.021326712623704225, "learning_rate": 1.9858546409162696e-05, "loss": 0.0009, "num_tokens": 13436080.0, "reward": 1.9629629850387573, "reward_std": 0.022859742864966393, "rewards/fixed_code_pass_all_test_reward/mean": 0.9629629850387573, "rewards/fixed_code_pass_all_test_reward/std": 0.022859742864966393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.29699317469101644, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.14404586819000542, "learning_rate": 1.985800624552496e-05, "loss": 0.0058, "num_tokens": 13446416.0, "reward": 1.435185194015503, "reward_std": 0.21413865685462952, "rewards/fixed_code_pass_all_test_reward/mean": 0.43518519401550293, "rewards/fixed_code_pass_all_test_reward/std": 0.2141387015581131, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2971776425013835, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.06881027901545167, "learning_rate": 1.985746505987469e-05, "loss": 0.0028, "num_tokens": 13453296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2973621103117506, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.05991009762510657, "learning_rate": 1.9856922852268e-05, "loss": 0.0024, "num_tokens": 13461382.0, "reward": 0.75, "reward_std": 1.0350983142852783, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.5175492167472839, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2975465781221177, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.09846128011122346, "learning_rate": 1.9856379622761094e-05, "loss": 0.0039, "num_tokens": 13469952.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2977310459324848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.0443333275616169, "learning_rate": 1.9855835371410296e-05, "loss": 0.0018, "num_tokens": 13481649.0, "reward": 1.3333333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.29791551374285186, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.015118375478778034, "learning_rate": 1.9855290098272033e-05, "loss": 0.0006, "num_tokens": 13487248.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 111.125, "completions/mean_terminated_length": 111.125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.298099981553219, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.04992464650422335, "learning_rate": 1.9854743803402825e-05, "loss": 0.002, "num_tokens": 13490937.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.29828444936358606, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.04677146906033158, "learning_rate": 1.985419648685932e-05, "loss": 0.0019, "num_tokens": 13496086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.29846891717395313, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.03819232643581927, "learning_rate": 1.9853648148698254e-05, "loss": 0.0015, "num_tokens": 13508607.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.29865338498432026, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03640616801567376, "learning_rate": 1.985309878897647e-05, "loss": 0.0015, "num_tokens": 13513094.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.29883785279468733, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.08982761995866895, "learning_rate": 1.9852548407750935e-05, "loss": 0.0036, "num_tokens": 13519443.0, "reward": 1.0806450843811035, "reward_std": 0.08621328324079514, "rewards/fixed_code_pass_all_test_reward/mean": 0.08064515888690948, "rewards/fixed_code_pass_all_test_reward/std": 0.08621330559253693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 460.875, "completions/mean_terminated_length": 460.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.2990223206050544, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.05403604102320969, "learning_rate": 1.98519970050787e-05, "loss": 0.0022, "num_tokens": 13531226.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.29920678841542153, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03963741788174957, "learning_rate": 1.985144458101693e-05, "loss": 0.0016, "num_tokens": 13535630.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.2993912562257886, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.06751695438288152, "learning_rate": 1.98508911356229e-05, "loss": 0.0027, "num_tokens": 13545666.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2995757240361557, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.10334598645567894, "learning_rate": 1.9850336668953988e-05, "loss": 0.0041, "num_tokens": 13553018.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2997601918465228, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.029492180794477463, "learning_rate": 1.9849781181067674e-05, "loss": 0.0012, "num_tokens": 13559422.0, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.2999446596568899, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.02903251990210265, "learning_rate": 1.984922467202155e-05, "loss": 0.0012, "num_tokens": 13565962.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.30012912746725695, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.10230041341856122, "learning_rate": 1.984866714187331e-05, "loss": 0.0041, "num_tokens": 13573203.0, "reward": 1.75, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3003135952776241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.07317360024899244, "learning_rate": 1.9848108590680756e-05, "loss": 0.0029, "num_tokens": 13580153.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.30049806308799115, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.10520786978304386, "learning_rate": 1.984754901850179e-05, "loss": 0.0042, "num_tokens": 13589159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.3006825308983582, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.14203880447894335, "learning_rate": 1.984698842539444e-05, "loss": 0.0057, "num_tokens": 13595272.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.30086699870872535, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.08123304788023233, "learning_rate": 1.9846426811416806e-05, "loss": 0.0032, "num_tokens": 13604206.0, "reward": 1.8055555820465088, "reward_std": 0.37912923097610474, "rewards/fixed_code_pass_all_test_reward/mean": 0.8055555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.37912923097610474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3010514665190924, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.022659472131635994, "learning_rate": 1.984586417662712e-05, "loss": 0.0009, "num_tokens": 13609670.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3012359343294595, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.18599138129502535, "learning_rate": 1.9845300521083713e-05, "loss": 0.0074, "num_tokens": 13618520.0, "reward": 1.7580645084381104, "reward_std": 0.37301579117774963, "rewards/fixed_code_pass_all_test_reward/mean": 0.7580645084381104, "rewards/fixed_code_pass_all_test_reward/std": 0.37301579117774963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 384.875, "completions/mean_terminated_length": 384.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3014204021398266, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.09427990880794823, "learning_rate": 1.9844735844845023e-05, "loss": 0.0038, "num_tokens": 13631015.0, "reward": 1.111111044883728, "reward_std": 0.5988426804542542, "rewards/fixed_code_pass_all_test_reward/mean": 0.2361111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.40761780738830566, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3016048699501937, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.039595776004716754, "learning_rate": 1.9844170147969585e-05, "loss": 0.0016, "num_tokens": 13638561.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.30178933776056077, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.10102736484259367, "learning_rate": 1.9843603430516055e-05, "loss": 0.004, "num_tokens": 13646543.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3019738055709279, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.09597691288217902, "learning_rate": 1.984303569254318e-05, "loss": 0.0038, "num_tokens": 13657318.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.302158273381295, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.08649599319323897, "learning_rate": 1.984246693410982e-05, "loss": 0.0035, "num_tokens": 13663503.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 613.125, "completions/mean_terminated_length": 613.125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.30234274119166205, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05350990151055157, "learning_rate": 1.9841897155274945e-05, "loss": 0.0021, "num_tokens": 13679336.0, "reward": 1.1484375, "reward_std": 0.639071524143219, "rewards/fixed_code_pass_all_test_reward/mean": 0.3984375, "rewards/fixed_code_pass_all_test_reward/std": 0.5021577477455139, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3025272090020291, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.0798694253899157, "learning_rate": 1.9841326356097622e-05, "loss": 0.0032, "num_tokens": 13687820.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.30271167681239625, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.1079470282420516, "learning_rate": 1.9840754536637025e-05, "loss": 0.0043, "num_tokens": 13696299.0, "reward": 1.9298245906829834, "reward_std": 0.12993964552879333, "rewards/fixed_code_pass_all_test_reward/mean": 0.9298245906829834, "rewards/fixed_code_pass_all_test_reward/std": 0.12993967533111572, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.3028961446227633, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.06968524935655296, "learning_rate": 1.9840181696952446e-05, "loss": 0.0028, "num_tokens": 13700560.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3030806124331304, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.09412944735959172, "learning_rate": 1.9839607837103263e-05, "loss": 0.0038, "num_tokens": 13710270.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.3032650802434975, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.1541341943666339, "learning_rate": 1.9839032957148974e-05, "loss": 0.0062, "num_tokens": 13715110.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3034495480538646, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.06373753608204424, "learning_rate": 1.983845705714918e-05, "loss": 0.0025, "num_tokens": 13719530.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.30363401586423167, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.16125423833727837, "learning_rate": 1.9837880137163586e-05, "loss": 0.0065, "num_tokens": 13725151.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3038184836745988, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.09002807457000017, "learning_rate": 1.9837302197252e-05, "loss": 0.0036, "num_tokens": 13732742.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.30400295148496587, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.20437498856335878, "learning_rate": 1.9836723237474342e-05, "loss": 0.0082, "num_tokens": 13739018.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 502.0, "completions/mean_terminated_length": 502.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.30418741929533294, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.06580684497021139, "learning_rate": 1.983614325789063e-05, "loss": 0.0026, "num_tokens": 13748330.0, "reward": 1.600000023841858, "reward_std": 0.5014265179634094, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.5014265179634094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.30437188710570007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04742681514471769, "learning_rate": 1.9835562258561005e-05, "loss": 0.0019, "num_tokens": 13754404.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.30455635491606714, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06347104976885021, "learning_rate": 1.9834980239545686e-05, "loss": 0.0025, "num_tokens": 13759410.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3047408227264342, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "kl": 0.14658135455101728, "learning_rate": 1.9834397200905024e-05, "loss": 0.0059, "num_tokens": 13767562.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.30492529053680134, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.10572158545255661, "learning_rate": 1.983381314269946e-05, "loss": 0.0042, "num_tokens": 13778735.0, "reward": 1.024999976158142, "reward_std": 0.04629099741578102, "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 372.25, "completions/mean_terminated_length": 372.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3051097583471684, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.10880009178072214, "learning_rate": 1.9833228064989543e-05, "loss": 0.0044, "num_tokens": 13786089.0, "reward": 0.9879031777381897, "reward_std": 0.39051389694213867, "rewards/fixed_code_pass_all_test_reward/mean": 0.11290322244167328, "rewards/fixed_code_pass_all_test_reward/std": 0.06678053736686707, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 349.125, "completions/mean_terminated_length": 349.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3052942261575355, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.08194419136270881, "learning_rate": 1.983264196783593e-05, "loss": 0.0033, "num_tokens": 13793018.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3054786939679026, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.08314052177593112, "learning_rate": 1.9832054851299388e-05, "loss": 0.0033, "num_tokens": 13804123.0, "reward": 1.2050971984863281, "reward_std": 0.08628341555595398, "rewards/fixed_code_pass_all_test_reward/mean": 0.20509707927703857, "rewards/fixed_code_pass_all_test_reward/std": 0.08628340065479279, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.3056631617782697, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.06289996206760406, "learning_rate": 1.9831466715440787e-05, "loss": 0.0025, "num_tokens": 13808777.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.30584762958863676, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.10293893702328205, "learning_rate": 1.9830877560321094e-05, "loss": 0.0041, "num_tokens": 13817846.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 406.125, "completions/mean_terminated_length": 406.125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.3060320973990039, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.07124783424660563, "learning_rate": 1.983028738600139e-05, "loss": 0.0028, "num_tokens": 13825855.0, "reward": 1.4921875, "reward_std": 0.6716413497924805, "rewards/fixed_code_pass_all_test_reward/mean": 0.6171875, "rewards/fixed_code_pass_all_test_reward/std": 0.42017680406570435, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.30621656520937096, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.09433985035866499, "learning_rate": 1.9829696192542864e-05, "loss": 0.0038, "num_tokens": 13829428.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 509.375, "completions/mean_terminated_length": 509.375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.30640103301973803, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.054964892100542784, "learning_rate": 1.9829103980006808e-05, "loss": 0.0022, "num_tokens": 13838287.0, "reward": 1.3854167461395264, "reward_std": 0.5040483474731445, "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.32443055510520935, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.30658550083010516, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.0928284740075469, "learning_rate": 1.982851074845461e-05, "loss": 0.0037, "num_tokens": 13861094.0, "reward": 1.966867446899414, "reward_std": 0.0937129408121109, "rewards/fixed_code_pass_all_test_reward/mean": 0.9668674468994141, "rewards/fixed_code_pass_all_test_reward/std": 0.0937129482626915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.30676996864047223, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.05304495617747307, "learning_rate": 1.9827916497947787e-05, "loss": 0.0021, "num_tokens": 13870103.0, "reward": 1.0214285850524902, "reward_std": 0.03967802971601486, "rewards/fixed_code_pass_all_test_reward/mean": 0.02142857201397419, "rewards/fixed_code_pass_all_test_reward/std": 0.03967800736427307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3069544364508393, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.11310707870870829, "learning_rate": 1.982732122854793e-05, "loss": 0.0045, "num_tokens": 13874312.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.30713890426120644, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.06347531673964113, "learning_rate": 1.9826724940316767e-05, "loss": 0.0025, "num_tokens": 13878773.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 336.875, "completions/mean_terminated_length": 336.875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.3073233720715735, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.08976999670267105, "learning_rate": 1.9826127633316106e-05, "loss": 0.0036, "num_tokens": 13886052.0, "reward": 1.4943182468414307, "reward_std": 0.25271546840667725, "rewards/fixed_code_pass_all_test_reward/mean": 0.4943181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.25271546840667725, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3075078398819406, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.06499262992292643, "learning_rate": 1.9825529307607883e-05, "loss": 0.0026, "num_tokens": 13895134.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3076923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.07601966988295317, "learning_rate": 1.9824929963254118e-05, "loss": 0.003, "num_tokens": 13902727.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3078767755026748, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.16574146365746856, "learning_rate": 1.9824329600316948e-05, "loss": 0.0066, "num_tokens": 13910788.0, "reward": 1.6734694242477417, "reward_std": 0.37882933020591736, "rewards/fixed_code_pass_all_test_reward/mean": 0.6734694242477417, "rewards/fixed_code_pass_all_test_reward/std": 0.37882930040359497, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.30806124331304185, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.059343681670725346, "learning_rate": 1.9823728218858624e-05, "loss": 0.0024, "num_tokens": 13915184.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.308245711123409, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.08318747580051422, "learning_rate": 1.9823125818941484e-05, "loss": 0.0033, "num_tokens": 13921363.0, "reward": 1.8571429252624512, "reward_std": 0.23741397261619568, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.23741400241851807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 488.0, "completions/mean_terminated_length": 488.0, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.30843017893377606, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.08815860049799085, "learning_rate": 1.9822522400627985e-05, "loss": 0.0035, "num_tokens": 13935363.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 468.0, "completions/mean_terminated_length": 468.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.30861464674414313, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.07267849054187536, "learning_rate": 1.9821917963980686e-05, "loss": 0.0029, "num_tokens": 13948811.0, "reward": 1.3303570747375488, "reward_std": 0.358034610748291, "rewards/fixed_code_pass_all_test_reward/mean": 0.4553571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.39849844574928284, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 646.375, "completions/mean_terminated_length": 646.375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.30879911455451026, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.05856925481930375, "learning_rate": 1.9821312509062247e-05, "loss": 0.0023, "num_tokens": 13965486.0, "reward": 1.03125, "reward_std": 0.4317220449447632, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.30898358236487733, "frac_reward_zero_std": 0.0, "grad_norm": 5.46875, "kl": 0.10472957044839859, "learning_rate": 1.982070603593544e-05, "loss": 0.0042, "num_tokens": 13969772.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.3091680501752444, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.09940497297793627, "learning_rate": 1.9820098544663142e-05, "loss": 0.004, "num_tokens": 13979868.0, "reward": 1.6530611515045166, "reward_std": 0.4158174395561218, "rewards/fixed_code_pass_all_test_reward/mean": 0.6530612707138062, "rewards/fixed_code_pass_all_test_reward/std": 0.4158174693584442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 498.0, "completions/mean_terminated_length": 498.0, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.30935251798561153, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.07028262037783861, "learning_rate": 1.981949003530833e-05, "loss": 0.0028, "num_tokens": 13990228.0, "reward": 0.8571428060531616, "reward_std": 0.3662113845348358, "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3095369857959786, "frac_reward_zero_std": 1.0, "grad_norm": 0.546875, "kl": 0.17270728386938572, "learning_rate": 1.9818880507934094e-05, "loss": 0.0069, "num_tokens": 13997500.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.3097214536063457, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.07210355903953314, "learning_rate": 1.981826996260362e-05, "loss": 0.0029, "num_tokens": 14001604.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3099059214167128, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.1321876011788845, "learning_rate": 1.9817658399380212e-05, "loss": 0.0053, "num_tokens": 14007437.0, "reward": 1.5, "reward_std": 0.3401506841182709, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3401506841182709, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3100903892270799, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.0725417211651802, "learning_rate": 1.9817045818327268e-05, "loss": 0.0029, "num_tokens": 14017912.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.31027485703744695, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.07207641378045082, "learning_rate": 1.9816432219508297e-05, "loss": 0.0029, "num_tokens": 14022170.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3104593248478141, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.20165031217038631, "learning_rate": 1.9815817602986916e-05, "loss": 0.0081, "num_tokens": 14031580.0, "reward": 1.2678570747375488, "reward_std": 0.45456862449645996, "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.31064379265818115, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0709644271992147, "learning_rate": 1.981520196882684e-05, "loss": 0.0028, "num_tokens": 14042056.0, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3108282604685482, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.11232777498662472, "learning_rate": 1.98145853170919e-05, "loss": 0.0045, "num_tokens": 14050458.0, "reward": 1.8081395626068115, "reward_std": 0.3606526255607605, "rewards/fixed_code_pass_all_test_reward/mean": 0.8081395626068115, "rewards/fixed_code_pass_all_test_reward/std": 0.3606526255607605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.31101272827891535, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.12830605870112777, "learning_rate": 1.9813967647846018e-05, "loss": 0.0051, "num_tokens": 14058667.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3111971960892824, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.07878115074709058, "learning_rate": 1.981334896115324e-05, "loss": 0.0032, "num_tokens": 14070223.0, "reward": 1.423076868057251, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.42307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.3113816638996495, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.05654013575986028, "learning_rate": 1.9812729257077695e-05, "loss": 0.0023, "num_tokens": 14084353.0, "reward": 1.120192289352417, "reward_std": 0.4987039268016815, "rewards/fixed_code_pass_all_test_reward/mean": 0.24519230425357819, "rewards/fixed_code_pass_all_test_reward/std": 0.23162616789340973, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3115661317100166, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.030497669125907123, "learning_rate": 1.9812108535683637e-05, "loss": 0.0012, "num_tokens": 14089065.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 485.75, "completions/mean_terminated_length": 485.75, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.3117505995203837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.08446342032402754, "learning_rate": 1.981148679703542e-05, "loss": 0.0034, "num_tokens": 14101935.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.31193506733075077, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.08789316844195127, "learning_rate": 1.9810864041197502e-05, "loss": 0.0035, "num_tokens": 14108662.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3121195351411179, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.0816650060005486, "learning_rate": 1.9810240268234438e-05, "loss": 0.0033, "num_tokens": 14118828.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.31230400295148497, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.08146433951333165, "learning_rate": 1.9809615478210907e-05, "loss": 0.0033, "num_tokens": 14128328.0, "reward": 1.2307692766189575, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.31248847076185204, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.05591106438077986, "learning_rate": 1.9808989671191675e-05, "loss": 0.0022, "num_tokens": 14135369.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3126729385722192, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.08190474798902869, "learning_rate": 1.9808362847241627e-05, "loss": 0.0033, "num_tokens": 14145028.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.31285740638258625, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.060504299122840166, "learning_rate": 1.9807735006425747e-05, "loss": 0.0024, "num_tokens": 14153055.0, "reward": 1.5833332538604736, "reward_std": 0.28171810507774353, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.28171807527542114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3130418741929533, "frac_reward_zero_std": 0.0, "grad_norm": 2.96875, "kl": 0.09807438356801867, "learning_rate": 1.980710614880912e-05, "loss": 0.0039, "num_tokens": 14157100.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.31322634200332045, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.10653264913707972, "learning_rate": 1.980647627445695e-05, "loss": 0.0043, "num_tokens": 14165138.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3134108098136875, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.06363171525299549, "learning_rate": 1.980584538343453e-05, "loss": 0.0025, "num_tokens": 14173082.0, "reward": 1.7589285373687744, "reward_std": 0.4467855989933014, "rewards/fixed_code_pass_all_test_reward/mean": 0.7589285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.446785569190979, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3135952776240546, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.048664238303899765, "learning_rate": 1.9805213475807274e-05, "loss": 0.0019, "num_tokens": 14179672.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3137797454344217, "frac_reward_zero_std": 1.0, "grad_norm": 3.1875, "kl": 0.1719646283891052, "learning_rate": 1.9804580551640685e-05, "loss": 0.0069, "num_tokens": 14184546.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3139642132447888, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.06589964078739285, "learning_rate": 1.9803946611000394e-05, "loss": 0.0026, "num_tokens": 14191123.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 445.5, "completions/mean_terminated_length": 445.5, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.31414868105515587, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.051606345223262906, "learning_rate": 1.980331165395211e-05, "loss": 0.0021, "num_tokens": 14200471.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.314333148865523, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.08974823076277971, "learning_rate": 1.9802675680561667e-05, "loss": 0.0036, "num_tokens": 14208904.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.31451761667589007, "frac_reward_zero_std": 0.0, "grad_norm": 0.60546875, "kl": 0.04024562076665461, "learning_rate": 1.9802038690895e-05, "loss": 0.0016, "num_tokens": 14215961.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.31470208448625714, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.12559832073748112, "learning_rate": 1.980140068501815e-05, "loss": 0.005, "num_tokens": 14221079.0, "reward": 1.875, "reward_std": 0.2368176281452179, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2368176281452179, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.3148865522966242, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.12382021732628345, "learning_rate": 1.9800761662997254e-05, "loss": 0.005, "num_tokens": 14230035.0, "reward": 1.6538461446762085, "reward_std": 0.40703868865966797, "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.40703868865966797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 280.875, "completions/mean_terminated_length": 280.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.31507102010699134, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.082843370269984, "learning_rate": 1.980012162489856e-05, "loss": 0.0033, "num_tokens": 14239602.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3152554879173584, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.07852113293483853, "learning_rate": 1.9799480570788433e-05, "loss": 0.0031, "num_tokens": 14246878.0, "reward": 1.21875, "reward_std": 0.24775780737400055, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.24775780737400055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.3154399557277255, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.09031384671106935, "learning_rate": 1.9798838500733327e-05, "loss": 0.0036, "num_tokens": 14255015.0, "reward": 1.1931817531585693, "reward_std": 0.09436090290546417, "rewards/fixed_code_pass_all_test_reward/mean": 0.1931818276643753, "rewards/fixed_code_pass_all_test_reward/std": 0.09436088055372238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3156244235380926, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.10014249850064516, "learning_rate": 1.979819541479981e-05, "loss": 0.004, "num_tokens": 14263255.0, "reward": 1.6959459781646729, "reward_std": 0.20822589099407196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6959459781646729, "rewards/fixed_code_pass_all_test_reward/std": 0.20822592079639435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3158088913484597, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.07781041180714965, "learning_rate": 1.979755131305455e-05, "loss": 0.0031, "num_tokens": 14272810.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.31599335915882676, "frac_reward_zero_std": 1.0, "grad_norm": 0.2490234375, "kl": 0.05364137451397255, "learning_rate": 1.979690619556433e-05, "loss": 0.0021, "num_tokens": 14277736.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.3161778269691939, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.06099038943648338, "learning_rate": 1.979626006239602e-05, "loss": 0.0024, "num_tokens": 14285470.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.31636229477956096, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.057390640024095774, "learning_rate": 1.979561291361661e-05, "loss": 0.0023, "num_tokens": 14296220.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.31654676258992803, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.07466377504169941, "learning_rate": 1.9794964749293203e-05, "loss": 0.003, "num_tokens": 14303702.0, "reward": 1.9166667461395264, "reward_std": 0.15430331230163574, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.31673123040029516, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.042671683710068464, "learning_rate": 1.9794315569492982e-05, "loss": 0.0017, "num_tokens": 14309856.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.31691569821066223, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.03824770194478333, "learning_rate": 1.979366537428326e-05, "loss": 0.0015, "num_tokens": 14317513.0, "reward": 1.25, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3171001660210293, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.02705427212640643, "learning_rate": 1.979301416373144e-05, "loss": 0.0011, "num_tokens": 14321532.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 676.875, "completions/mean_terminated_length": 676.875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.31728463383139643, "frac_reward_zero_std": 1.0, "grad_norm": 0.029052734375, "kl": 0.026389537379145622, "learning_rate": 1.9792361937905038e-05, "loss": 0.0011, "num_tokens": 14338331.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3174691016417635, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.09620907926000655, "learning_rate": 1.979170869687167e-05, "loss": 0.0038, "num_tokens": 14344643.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3176535694521306, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.10019650589674711, "learning_rate": 1.9791054440699057e-05, "loss": 0.004, "num_tokens": 14354152.0, "reward": 1.446428656578064, "reward_std": 0.5529538989067078, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.3740878105163574, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3178380372624977, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.0830680918879807, "learning_rate": 1.9790399169455033e-05, "loss": 0.0033, "num_tokens": 14361228.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 454.625, "completions/mean_terminated_length": 454.625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.3180225050728648, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.042373246513307095, "learning_rate": 1.9789742883207532e-05, "loss": 0.0017, "num_tokens": 14370761.0, "reward": 1.3318965435028076, "reward_std": 0.13410648703575134, "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.13410645723342896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.31820697288323185, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.07158723333850503, "learning_rate": 1.978908558202459e-05, "loss": 0.0029, "num_tokens": 14378992.0, "reward": 1.688596487045288, "reward_std": 0.30538409948349, "rewards/fixed_code_pass_all_test_reward/mean": 0.6885964870452881, "rewards/fixed_code_pass_all_test_reward/std": 0.3053841292858124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.318391440693599, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.10440605506300926, "learning_rate": 1.9788427265974355e-05, "loss": 0.0042, "num_tokens": 14385741.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.31857590850396605, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.12287145247682929, "learning_rate": 1.9787767935125072e-05, "loss": 0.0049, "num_tokens": 14396905.0, "reward": 1.839962124824524, "reward_std": 0.19996216893196106, "rewards/fixed_code_pass_all_test_reward/mean": 0.8399621248245239, "rewards/fixed_code_pass_all_test_reward/std": 0.19996218383312225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.3187603763143331, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.06132039101794362, "learning_rate": 1.97871075895451e-05, "loss": 0.0025, "num_tokens": 14403200.0, "reward": 1.8690476417541504, "reward_std": 0.2935435473918915, "rewards/fixed_code_pass_all_test_reward/mean": 0.8690475821495056, "rewards/fixed_code_pass_all_test_reward/std": 0.2935435175895691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.31894484412470026, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.09117587888613343, "learning_rate": 1.97864462293029e-05, "loss": 0.0036, "num_tokens": 14411833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 495.25, "completions/mean_terminated_length": 495.25, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.31912931193506733, "frac_reward_zero_std": 1.0, "grad_norm": 0.3203125, "kl": 0.06196676520630717, "learning_rate": 1.9785783854467037e-05, "loss": 0.0025, "num_tokens": 14428219.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 367.875, "completions/mean_terminated_length": 367.875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3193137797454344, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04813338629901409, "learning_rate": 1.978512046510618e-05, "loss": 0.0019, "num_tokens": 14436666.0, "reward": 1.21875, "reward_std": 0.4712729752063751, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.31949824755580153, "frac_reward_zero_std": 1.0, "grad_norm": 0.40234375, "kl": 0.1605838891118765, "learning_rate": 1.9784456061289105e-05, "loss": 0.0064, "num_tokens": 14445188.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.3196827153661686, "frac_reward_zero_std": 1.0, "grad_norm": 0.2578125, "kl": 0.10109540540724993, "learning_rate": 1.9783790643084694e-05, "loss": 0.004, "num_tokens": 14449262.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1377.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 594.75, "completions/mean_terminated_length": 594.75, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.3198671831765357, "frac_reward_zero_std": 0.0, "grad_norm": 0.609375, "kl": 0.04369717533700168, "learning_rate": 1.978312421056193e-05, "loss": 0.0017, "num_tokens": 14467316.0, "reward": 1.8645833730697632, "reward_std": 0.3505593538284302, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.35055938363075256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.3200516509869028, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.08497690130025148, "learning_rate": 1.978245676378991e-05, "loss": 0.0034, "num_tokens": 14473918.0, "reward": 1.8333333730697632, "reward_std": 0.35634833574295044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3202361187972699, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.08126162411645055, "learning_rate": 1.9781788302837824e-05, "loss": 0.0032, "num_tokens": 14478598.0, "reward": 1.9791666269302368, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.32042058660763695, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.08717028517276049, "learning_rate": 1.9781118827774978e-05, "loss": 0.0035, "num_tokens": 14489008.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3206050544180041, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04792699869722128, "learning_rate": 1.9780448338670775e-05, "loss": 0.0019, "num_tokens": 14495631.0, "reward": 1.7678570747375488, "reward_std": 0.43153735995292664, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.43153735995292664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.32078952222837115, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.05925857205875218, "learning_rate": 1.9779776835594734e-05, "loss": 0.0024, "num_tokens": 14503556.0, "reward": 1.6958333253860474, "reward_std": 0.4314749240875244, "rewards/fixed_code_pass_all_test_reward/mean": 0.6958333253860474, "rewards/fixed_code_pass_all_test_reward/std": 0.4314749836921692, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3209739900387382, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.12323627155274153, "learning_rate": 1.9779104318616464e-05, "loss": 0.0049, "num_tokens": 14508860.0, "reward": 1.5541666746139526, "reward_std": 0.33990541100502014, "rewards/fixed_code_pass_all_test_reward/mean": 0.5541666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.33990544080734253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.32115845784910535, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0983538986183703, "learning_rate": 1.9778430787805692e-05, "loss": 0.0039, "num_tokens": 14516800.0, "reward": 1.9812500476837158, "reward_std": 0.0530330166220665, "rewards/fixed_code_pass_all_test_reward/mean": 0.981249988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.053033001720905304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.3213429256594724, "frac_reward_zero_std": 1.0, "grad_norm": 0.1865234375, "kl": 0.07698617875576019, "learning_rate": 1.977775624323224e-05, "loss": 0.0031, "num_tokens": 14521807.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.3215273934698395, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03197456058114767, "learning_rate": 1.977708068496605e-05, "loss": 0.0013, "num_tokens": 14526871.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3217118612802066, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06734870001673698, "learning_rate": 1.977640411307715e-05, "loss": 0.0027, "num_tokens": 14532160.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 435.625, "completions/mean_terminated_length": 205.2857208251953, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3218963290905737, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.08839738339884207, "learning_rate": 1.9775726527635687e-05, "loss": 0.0035, "num_tokens": 14539581.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.32208079690094077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.06652000150643289, "learning_rate": 1.9775047928711906e-05, "loss": 0.0027, "num_tokens": 14543747.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3222652647113079, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.10362414317205548, "learning_rate": 1.977436831637616e-05, "loss": 0.0041, "num_tokens": 14552860.0, "reward": 1.0762712955474854, "reward_std": 0.04707559198141098, "rewards/fixed_code_pass_all_test_reward/mean": 0.0762711837887764, "rewards/fixed_code_pass_all_test_reward/std": 0.04707559570670128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.32244973252167497, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.07780564576387405, "learning_rate": 1.977368769069891e-05, "loss": 0.0031, "num_tokens": 14557097.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.32263420033204204, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.09352086530998349, "learning_rate": 1.9773006051750716e-05, "loss": 0.0037, "num_tokens": 14561118.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.32281866814240917, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.10218691546469927, "learning_rate": 1.9772323399602243e-05, "loss": 0.0041, "num_tokens": 14568567.0, "reward": 1.7010868787765503, "reward_std": 0.2475235015153885, "rewards/fixed_code_pass_all_test_reward/mean": 0.7010869979858398, "rewards/fixed_code_pass_all_test_reward/std": 0.24752351641654968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.32300313595277624, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.07289698161184788, "learning_rate": 1.9771639734324272e-05, "loss": 0.0029, "num_tokens": 14574947.0, "reward": 1.4479166269302368, "reward_std": 0.23543904721736908, "rewards/fixed_code_pass_all_test_reward/mean": 0.4479166567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.23543907701969147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3231876037631433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.057445792481303215, "learning_rate": 1.9770955055987673e-05, "loss": 0.0023, "num_tokens": 14584423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.32337207157351044, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.07450989726930857, "learning_rate": 1.9770269364663433e-05, "loss": 0.003, "num_tokens": 14596981.0, "reward": 1.034482717514038, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.03448275849223137, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3235565393838775, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.11056117340922356, "learning_rate": 1.9769582660422636e-05, "loss": 0.0044, "num_tokens": 14604300.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.3237410071942446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.14394644927233458, "learning_rate": 1.976889494333648e-05, "loss": 0.0058, "num_tokens": 14611061.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3239254750046117, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.06253304937854409, "learning_rate": 1.9768206213476258e-05, "loss": 0.0025, "num_tokens": 14617337.0, "reward": 1.53125, "reward_std": 0.6838376522064209, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.39387044310569763, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3241099428149788, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.09701111260801554, "learning_rate": 1.976751647091338e-05, "loss": 0.0039, "num_tokens": 14628457.0, "reward": 1.7740384340286255, "reward_std": 0.2652272582054138, "rewards/fixed_code_pass_all_test_reward/mean": 0.7740384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.2652272880077362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.32429441062534586, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04948270693421364, "learning_rate": 1.9766825715719347e-05, "loss": 0.002, "num_tokens": 14636495.0, "reward": 1.5961538553237915, "reward_std": 0.17804235219955444, "rewards/fixed_code_pass_all_test_reward/mean": 0.5961538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.17804233729839325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.324478878435713, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.06723115220665932, "learning_rate": 1.9766133947965773e-05, "loss": 0.0027, "num_tokens": 14641021.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.32466334624608006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.0807058997452259, "learning_rate": 1.9765441167724376e-05, "loss": 0.0032, "num_tokens": 14647569.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.32484781405644714, "frac_reward_zero_std": 0.0, "grad_norm": 4.15625, "kl": 0.13152326829731464, "learning_rate": 1.9764747375066984e-05, "loss": 0.0053, "num_tokens": 14654335.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.32503228186681427, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.10068637505173683, "learning_rate": 1.9764052570065518e-05, "loss": 0.004, "num_tokens": 14658550.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.32521674967718134, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.06980323418974876, "learning_rate": 1.9763356752792015e-05, "loss": 0.0028, "num_tokens": 14665270.0, "reward": 1.6682692766189575, "reward_std": 0.21059979498386383, "rewards/fixed_code_pass_all_test_reward/mean": 0.6682692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.21059982478618622, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3254012174875484, "frac_reward_zero_std": 1.0, "grad_norm": 0.2177734375, "kl": 0.09653685847297311, "learning_rate": 1.9762659923318612e-05, "loss": 0.0039, "num_tokens": 14674503.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.32558568529791554, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.08440969977527857, "learning_rate": 1.976196208171755e-05, "loss": 0.0034, "num_tokens": 14685618.0, "reward": 1.575657844543457, "reward_std": 0.4741930067539215, "rewards/fixed_code_pass_all_test_reward/mean": 0.7006579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.43056395649909973, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3257701531082826, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.11532928328961134, "learning_rate": 1.9761263228061177e-05, "loss": 0.0046, "num_tokens": 14695811.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3259546209186497, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.08350975532084703, "learning_rate": 1.9760563362421946e-05, "loss": 0.0033, "num_tokens": 14703234.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3261390887290168, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.07726488402113318, "learning_rate": 1.9759862484872416e-05, "loss": 0.0031, "num_tokens": 14715601.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3263235565393839, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.07589742448180914, "learning_rate": 1.9759160595485246e-05, "loss": 0.003, "num_tokens": 14722644.0, "reward": 1.5416666269302368, "reward_std": 0.494011789560318, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.4940117597579956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.32650802434975096, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.06759870890527964, "learning_rate": 1.9758457694333205e-05, "loss": 0.0027, "num_tokens": 14733911.0, "reward": 1.08695650100708, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.08695652335882187, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3266924921601181, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.07559532998129725, "learning_rate": 1.975775378148917e-05, "loss": 0.003, "num_tokens": 14743070.0, "reward": 1.4241071939468384, "reward_std": 0.12725728750228882, "rewards/fixed_code_pass_all_test_reward/mean": 0.4241071343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.1272573173046112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.32687695997048516, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.06814621575176716, "learning_rate": 1.9757048857026105e-05, "loss": 0.0027, "num_tokens": 14748457.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.32706142778085223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.0432502212934196, "learning_rate": 1.9756342921017105e-05, "loss": 0.0017, "num_tokens": 14752979.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3272458955912193, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.036549957701936364, "learning_rate": 1.975563597353535e-05, "loss": 0.0015, "num_tokens": 14758408.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 501.25, "completions/mean_terminated_length": 501.25, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.32743036340158643, "frac_reward_zero_std": 1.0, "grad_norm": 0.61328125, "kl": 0.11061828769743443, "learning_rate": 1.9754928014654134e-05, "loss": 0.0044, "num_tokens": 14768354.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3276148312119535, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.10955561697483063, "learning_rate": 1.9754219044446855e-05, "loss": 0.0044, "num_tokens": 14776715.0, "reward": 1.9895832538604736, "reward_std": 0.01928795501589775, "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.019287927076220512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3277992990223206, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.06415437115356326, "learning_rate": 1.9753509062987008e-05, "loss": 0.0026, "num_tokens": 14783779.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 429.625, "completions/mean_terminated_length": 429.625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3279837668326877, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.06184139382094145, "learning_rate": 1.9752798070348203e-05, "loss": 0.0025, "num_tokens": 14794976.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 361.875, "completions/mean_terminated_length": 361.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.3281682346430548, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.04192975303158164, "learning_rate": 1.9752086066604157e-05, "loss": 0.0017, "num_tokens": 14802799.0, "reward": 1.1818182468414307, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.32835270245342185, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03831974393688142, "learning_rate": 1.9751373051828674e-05, "loss": 0.0015, "num_tokens": 14808329.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 433.375, "completions/mean_terminated_length": 433.375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.328537170263789, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.07447442412376404, "learning_rate": 1.9750659026095684e-05, "loss": 0.003, "num_tokens": 14817148.0, "reward": 1.875, "reward_std": 0.13363061845302582, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.32872163807415605, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.07274224190041423, "learning_rate": 1.9749943989479206e-05, "loss": 0.0029, "num_tokens": 14824733.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.3289061058845231, "frac_reward_zero_std": 1.0, "grad_norm": 0.220703125, "kl": 0.14287777710705996, "learning_rate": 1.974922794205338e-05, "loss": 0.0057, "num_tokens": 14830736.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.32909057369489025, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.07484631799161434, "learning_rate": 1.974851088389243e-05, "loss": 0.003, "num_tokens": 14838612.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 368.125, "completions/mean_terminated_length": 368.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.3292750415052573, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.06566274212673306, "learning_rate": 1.97477928150707e-05, "loss": 0.0026, "num_tokens": 14849221.0, "reward": 1.845070481300354, "reward_std": 0.13039718568325043, "rewards/fixed_code_pass_all_test_reward/mean": 0.8450704216957092, "rewards/fixed_code_pass_all_test_reward/std": 0.13039720058441162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3294595093156244, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.10184938367456198, "learning_rate": 1.9747073735662635e-05, "loss": 0.0041, "num_tokens": 14858932.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3296439771259915, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.09796668263152242, "learning_rate": 1.9746353645742787e-05, "loss": 0.0039, "num_tokens": 14865335.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 374.625, "completions/mean_terminated_length": 374.625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.3298284449363586, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.04449487570673227, "learning_rate": 1.9745632545385806e-05, "loss": 0.0018, "num_tokens": 14873028.0, "reward": 1.45652174949646, "reward_std": 0.7267876863479614, "rewards/fixed_code_pass_all_test_reward/mean": 0.58152174949646, "rewards/fixed_code_pass_all_test_reward/std": 0.4868996739387512, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 720.625, "completions/mean_terminated_length": 720.625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.3300129127467257, "frac_reward_zero_std": 0.0, "grad_norm": 0.5625, "kl": 0.03969099558889866, "learning_rate": 1.9744910434666448e-05, "loss": 0.0016, "num_tokens": 14886681.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3301973805570928, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0898564518429339, "learning_rate": 1.9744187313659584e-05, "loss": 0.0036, "num_tokens": 14895284.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 768.875, "completions/mean_terminated_length": 586.1428833007812, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.3303818483674599, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.02595028094947338, "learning_rate": 1.974346318244018e-05, "loss": 0.001, "num_tokens": 14909691.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.33056631617782695, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.046057891100645065, "learning_rate": 1.9742738041083308e-05, "loss": 0.0018, "num_tokens": 14914487.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 392.375, "completions/mean_terminated_length": 392.375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.3307507839881941, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.07713411701843143, "learning_rate": 1.9742011889664144e-05, "loss": 0.0031, "num_tokens": 14924410.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 367.75, "completions/mean_terminated_length": 367.75, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.33093525179856115, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.04441390396095812, "learning_rate": 1.9741284728257976e-05, "loss": 0.0018, "num_tokens": 14934248.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 586.875, "completions/mean_terminated_length": 378.14288330078125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.3311197196089282, "frac_reward_zero_std": 0.0, "grad_norm": 0.7265625, "kl": 0.058024664875119925, "learning_rate": 1.9740556556940187e-05, "loss": 0.0023, "num_tokens": 14943599.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.33130418741929535, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.06734179286286235, "learning_rate": 1.9739827375786273e-05, "loss": 0.0027, "num_tokens": 14952592.0, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3314886552296624, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.05632603308185935, "learning_rate": 1.9739097184871823e-05, "loss": 0.0023, "num_tokens": 14958765.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 500.125, "completions/mean_terminated_length": 500.125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.3316731230400295, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.03952726791612804, "learning_rate": 1.9738365984272544e-05, "loss": 0.0016, "num_tokens": 14968414.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 398.25, "completions/mean_terminated_length": 398.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3318575908503966, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.05610333103686571, "learning_rate": 1.973763377406424e-05, "loss": 0.0022, "num_tokens": 14978496.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 485.875, "completions/mean_terminated_length": 485.875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.3320420586607637, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.025123853236436844, "learning_rate": 1.9736900554322824e-05, "loss": 0.001, "num_tokens": 14992063.0, "reward": 1.128151297569275, "reward_std": 0.23729005455970764, "rewards/fixed_code_pass_all_test_reward/mean": 0.12815126776695251, "rewards/fixed_code_pass_all_test_reward/std": 0.23729003965854645, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 585.0, "completions/mean_terminated_length": 585.0, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.33222652647113077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.039046943886205554, "learning_rate": 1.973616632512431e-05, "loss": 0.0016, "num_tokens": 15002527.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3324109942814979, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.07029687252361327, "learning_rate": 1.9735431086544818e-05, "loss": 0.0028, "num_tokens": 15012271.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.33259546209186497, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.10851633409038186, "learning_rate": 1.973469483866057e-05, "loss": 0.0043, "num_tokens": 15022867.0, "reward": 1.3935811519622803, "reward_std": 0.1874489039182663, "rewards/fixed_code_pass_all_test_reward/mean": 0.3935810923576355, "rewards/fixed_code_pass_all_test_reward/std": 0.1874489188194275, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.33277992990223204, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.04353495198301971, "learning_rate": 1.9733957581547905e-05, "loss": 0.0017, "num_tokens": 15028240.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.33296439771259917, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.03335838089697063, "learning_rate": 1.9733219315283245e-05, "loss": 0.0013, "num_tokens": 15034678.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.33314886552296624, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.10374193405732512, "learning_rate": 1.9732480039943133e-05, "loss": 0.0041, "num_tokens": 15038603.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.08150022150948644, "learning_rate": 1.9731739755604217e-05, "loss": 0.0033, "num_tokens": 15043818.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 483.125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.33351780114370044, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.04728736914694309, "learning_rate": 1.9730998462343237e-05, "loss": 0.0019, "num_tokens": 15054699.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3337022689540675, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.04933475377038121, "learning_rate": 1.9730256160237047e-05, "loss": 0.002, "num_tokens": 15060906.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3338867367644346, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.05992202879860997, "learning_rate": 1.9729512849362607e-05, "loss": 0.0024, "num_tokens": 15070060.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3340712045748017, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.05250877724029124, "learning_rate": 1.9728768529796976e-05, "loss": 0.0021, "num_tokens": 15077599.0, "reward": 1.65816330909729, "reward_std": 0.21973492205142975, "rewards/fixed_code_pass_all_test_reward/mean": 0.6581632494926453, "rewards/fixed_code_pass_all_test_reward/std": 0.21973496675491333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3342556723851688, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.09539045346900821, "learning_rate": 1.9728023201617326e-05, "loss": 0.0038, "num_tokens": 15086804.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 433.0, "completions/mean_terminated_length": 433.0, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.33444014019553586, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06113663408905268, "learning_rate": 1.972727686490092e-05, "loss": 0.0024, "num_tokens": 15102708.0, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 382.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.334624608005903, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04856591857969761, "learning_rate": 1.9726529519725136e-05, "loss": 0.0019, "num_tokens": 15113051.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.33480907581627006, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.03870244463905692, "learning_rate": 1.9725781166167452e-05, "loss": 0.0015, "num_tokens": 15119550.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.33499354362663714, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04484340222552419, "learning_rate": 1.9725031804305454e-05, "loss": 0.0018, "num_tokens": 15128954.0, "reward": 1.7799999713897705, "reward_std": 0.40736088156700134, "rewards/fixed_code_pass_all_test_reward/mean": 0.7799999713897705, "rewards/fixed_code_pass_all_test_reward/std": 0.40736085176467896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.33517801143700426, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.06208996404893696, "learning_rate": 1.9724281434216836e-05, "loss": 0.0025, "num_tokens": 15132991.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 535.5, "completions/mean_terminated_length": 535.5, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.33536247924737134, "frac_reward_zero_std": 0.0, "grad_norm": 0.609375, "kl": 0.029938201420009136, "learning_rate": 1.972353005597938e-05, "loss": 0.0012, "num_tokens": 15143627.0, "reward": 1.3928571939468384, "reward_std": 0.2753211557865143, "rewards/fixed_code_pass_all_test_reward/mean": 0.392857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.27532118558883667, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3355469470577384, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06490082759410143, "learning_rate": 1.9722777669670995e-05, "loss": 0.0026, "num_tokens": 15153150.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.33573141486810554, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.10711404168978333, "learning_rate": 1.9722024275369677e-05, "loss": 0.0043, "num_tokens": 15158097.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 450.25, "completions/mean_terminated_length": 450.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3359158826784726, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.0272505545290187, "learning_rate": 1.9721269873153535e-05, "loss": 0.0011, "num_tokens": 15166827.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 255.25, "completions/mean_terminated_length": 255.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3361003504888397, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.05219823378138244, "learning_rate": 1.9720514463100783e-05, "loss": 0.0021, "num_tokens": 15178613.0, "reward": 1.495192289352417, "reward_std": 0.20397312939167023, "rewards/fixed_code_pass_all_test_reward/mean": 0.4951923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.20397312939167023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3362848182992068, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.06762572703883052, "learning_rate": 1.971975804528973e-05, "loss": 0.0027, "num_tokens": 15183159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3364692861095739, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.07368692522868514, "learning_rate": 1.9719000619798804e-05, "loss": 0.0029, "num_tokens": 15187852.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.33665375391994096, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.07078687148168683, "learning_rate": 1.971824218670652e-05, "loss": 0.0028, "num_tokens": 15197614.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 417.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3368382217303081, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.07483380509074777, "learning_rate": 1.971748274609152e-05, "loss": 0.003, "num_tokens": 15207105.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.33702268954067516, "frac_reward_zero_std": 1.0, "grad_norm": 0.1572265625, "kl": 0.08799533522687852, "learning_rate": 1.9716722298032528e-05, "loss": 0.0035, "num_tokens": 15212757.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.33720715735104223, "frac_reward_zero_std": 1.0, "grad_norm": 0.271484375, "kl": 0.07945042056962848, "learning_rate": 1.9715960842608385e-05, "loss": 0.0032, "num_tokens": 15216615.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 442.125, "completions/mean_terminated_length": 442.125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.33739162516140936, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.01196912198793143, "learning_rate": 1.9715198379898036e-05, "loss": 0.0005, "num_tokens": 15225192.0, "reward": 1.1500000953674316, "reward_std": 0.053452279418706894, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.053452249616384506, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.33757609297177643, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.03773574670776725, "learning_rate": 1.9714434909980524e-05, "loss": 0.0015, "num_tokens": 15229638.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 457.375, "completions/mean_terminated_length": 457.375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.3377605607821435, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.02060489682480693, "learning_rate": 1.9713670432935008e-05, "loss": 0.0008, "num_tokens": 15238233.0, "reward": 1.875, "reward_std": 0.2082483023405075, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2082482874393463, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.33794502859251063, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05617201211862266, "learning_rate": 1.971290494884073e-05, "loss": 0.0022, "num_tokens": 15248348.0, "reward": 1.7750000953674316, "reward_std": 0.310529500246048, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.3381294964028777, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.06647268310189247, "learning_rate": 1.9712138457777067e-05, "loss": 0.0027, "num_tokens": 15257724.0, "reward": 1.5592105388641357, "reward_std": 0.38669759035110474, "rewards/fixed_code_pass_all_test_reward/mean": 0.5592105388641357, "rewards/fixed_code_pass_all_test_reward/std": 0.38669759035110474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3383139642132448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.021635653014527634, "learning_rate": 1.9711370959823472e-05, "loss": 0.0009, "num_tokens": 15264471.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3384984320236119, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.06669621355831623, "learning_rate": 1.9710602455059516e-05, "loss": 0.0027, "num_tokens": 15275620.0, "reward": 1.2727272510528564, "reward_std": 0.3331364691257477, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.33313652873039246, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 489.375, "completions/mean_terminated_length": 266.71429443359375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.338682899833979, "frac_reward_zero_std": 0.0, "grad_norm": 0.53125, "kl": 0.07141411933116615, "learning_rate": 1.9709832943564874e-05, "loss": 0.0029, "num_tokens": 15287327.0, "reward": 1.5499999523162842, "reward_std": 0.6406718492507935, "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.3043079078197479, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.33886736764434605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.036501258378848433, "learning_rate": 1.9709062425419326e-05, "loss": 0.0015, "num_tokens": 15293435.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 579.25, "completions/mean_terminated_length": 579.25, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.3390518354547132, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.04275131202302873, "learning_rate": 1.9708290900702752e-05, "loss": 0.0017, "num_tokens": 15307453.0, "reward": 1.4711538553237915, "reward_std": 0.467791885137558, "rewards/fixed_code_pass_all_test_reward/mean": 0.4711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.46779191493988037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.33923630326508025, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.08385665202513337, "learning_rate": 1.9707518369495138e-05, "loss": 0.0034, "num_tokens": 15315083.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 498.625, "completions/mean_terminated_length": 498.625, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.3394207710754473, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.05743088014423847, "learning_rate": 1.9706744831876576e-05, "loss": 0.0023, "num_tokens": 15323984.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3396052388858144, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.0571156470105052, "learning_rate": 1.970597028792726e-05, "loss": 0.0023, "num_tokens": 15330033.0, "reward": 1.6101974248886108, "reward_std": 0.22450506687164307, "rewards/fixed_code_pass_all_test_reward/mean": 0.6101974248886108, "rewards/fixed_code_pass_all_test_reward/std": 0.22450508177280426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 710.5, "completions/mean_terminated_length": 710.5, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.3397897066961815, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.03120402479544282, "learning_rate": 1.9705194737727492e-05, "loss": 0.0012, "num_tokens": 15342869.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3399741745065486, "frac_reward_zero_std": 1.0, "grad_norm": 0.30078125, "kl": 0.08945373119786382, "learning_rate": 1.9704418181357675e-05, "loss": 0.0036, "num_tokens": 15347114.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.34015864231691567, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.05138180451467633, "learning_rate": 1.9703640618898313e-05, "loss": 0.0021, "num_tokens": 15351717.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3403431101272828, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04971617856062949, "learning_rate": 1.9702862050430024e-05, "loss": 0.002, "num_tokens": 15359792.0, "reward": 1.21875, "reward_std": 0.23385359346866608, "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, "rewards/fixed_code_pass_all_test_reward/std": 0.23385359346866608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3405275779376499, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.07072699163109064, "learning_rate": 1.9702082476033522e-05, "loss": 0.0028, "num_tokens": 15367709.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 242.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.34071204574801695, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.03315013169776648, "learning_rate": 1.9701301895789627e-05, "loss": 0.0013, "num_tokens": 15373210.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 377.75, "completions/mean_terminated_length": 377.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3408965135583841, "frac_reward_zero_std": 1.0, "grad_norm": 0.62890625, "kl": 0.0798835544846952, "learning_rate": 1.9700520309779268e-05, "loss": 0.0032, "num_tokens": 15381600.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.34108098136875115, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.06389265088364482, "learning_rate": 1.9699737718083475e-05, "loss": 0.0026, "num_tokens": 15390767.0, "reward": 1.7190594673156738, "reward_std": 0.40474647283554077, "rewards/fixed_code_pass_all_test_reward/mean": 0.719059407711029, "rewards/fixed_code_pass_all_test_reward/std": 0.40474650263786316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.3412654491791182, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.03468956728465855, "learning_rate": 1.9698954120783378e-05, "loss": 0.0014, "num_tokens": 15394609.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 721.125, "completions/mean_terminated_length": 721.125, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.34144991698948535, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.040311090648174286, "learning_rate": 1.9698169517960216e-05, "loss": 0.0016, "num_tokens": 15412226.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3416343847998524, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.061570935882627964, "learning_rate": 1.9697383909695332e-05, "loss": 0.0025, "num_tokens": 15420096.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3418188526102195, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.03358614759054035, "learning_rate": 1.9696597296070177e-05, "loss": 0.0013, "num_tokens": 15425860.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.3420033204205866, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.06291339313611388, "learning_rate": 1.9695809677166294e-05, "loss": 0.0025, "num_tokens": 15436938.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3421877882309537, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02087060033227317, "learning_rate": 1.969502105306534e-05, "loss": 0.0008, "num_tokens": 15445556.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.34237225604132077, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.047828986775130033, "learning_rate": 1.9694231423849083e-05, "loss": 0.0019, "num_tokens": 15453219.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3425567238516879, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.07386017125099897, "learning_rate": 1.9693440789599373e-05, "loss": 0.003, "num_tokens": 15459350.0, "reward": 1.5586735010147095, "reward_std": 0.35565075278282166, "rewards/fixed_code_pass_all_test_reward/mean": 0.5586735010147095, "rewards/fixed_code_pass_all_test_reward/std": 0.35565078258514404, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.34274119166205497, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.07171640545129776, "learning_rate": 1.969264915039819e-05, "loss": 0.0029, "num_tokens": 15466491.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.34292565947242204, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.05277696577832103, "learning_rate": 1.9691856506327595e-05, "loss": 0.0021, "num_tokens": 15475882.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.34311012728278917, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.07378734485246241, "learning_rate": 1.9691062857469773e-05, "loss": 0.003, "num_tokens": 15480139.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.34329459509315624, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.06306059774942696, "learning_rate": 1.9690268203907e-05, "loss": 0.0025, "num_tokens": 15486311.0, "reward": 1.625, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3434790629035233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.05596954328939319, "learning_rate": 1.968947254572166e-05, "loss": 0.0022, "num_tokens": 15491430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.34366353071389044, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.08689720742404461, "learning_rate": 1.9688675882996243e-05, "loss": 0.0035, "num_tokens": 15495660.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 545.875, "completions/mean_terminated_length": 545.875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.3438479985242575, "frac_reward_zero_std": 1.0, "grad_norm": 0.251953125, "kl": 0.03413332987111062, "learning_rate": 1.968787821581334e-05, "loss": 0.0014, "num_tokens": 15503987.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3440324663346246, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.05155676091089845, "learning_rate": 1.9687079544255653e-05, "loss": 0.0021, "num_tokens": 15508773.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 342.25, "completions/mean_terminated_length": 342.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3442169341449917, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.05855845287442207, "learning_rate": 1.9686279868405975e-05, "loss": 0.0023, "num_tokens": 15515951.0, "reward": 1.3691861629486084, "reward_std": 0.2548873722553253, "rewards/fixed_code_pass_all_test_reward/mean": 0.36918604373931885, "rewards/fixed_code_pass_all_test_reward/std": 0.25488734245300293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3444014019553588, "frac_reward_zero_std": 1.0, "grad_norm": 0.5546875, "kl": 0.09417074674274772, "learning_rate": 1.9685479188347214e-05, "loss": 0.0038, "num_tokens": 15524502.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.34458586976572586, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.058050604071468115, "learning_rate": 1.9684677504162384e-05, "loss": 0.0023, "num_tokens": 15534222.0, "reward": 1.8607594966888428, "reward_std": 0.19316022098064423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8607594966888428, "rewards/fixed_code_pass_all_test_reward/std": 0.19316023588180542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 497.5, "completions/mean_terminated_length": 497.5, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.344770337576093, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.04016149416565895, "learning_rate": 1.9683874815934595e-05, "loss": 0.0016, "num_tokens": 15544026.0, "reward": 1.2300000190734863, "reward_std": 0.26251524686813354, "rewards/fixed_code_pass_all_test_reward/mean": 0.23000000417232513, "rewards/fixed_code_pass_all_test_reward/std": 0.2625153064727783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.34495480538646006, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.04399755736812949, "learning_rate": 1.968307112374706e-05, "loss": 0.0018, "num_tokens": 15553175.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 475.0, "completions/mean_terminated_length": 475.0, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.34513927319682713, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.0354848129209131, "learning_rate": 1.9682266427683107e-05, "loss": 0.0014, "num_tokens": 15562503.0, "reward": 1.6527777910232544, "reward_std": 0.4840944707393646, "rewards/fixed_code_pass_all_test_reward/mean": 0.6527777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.4840944707393646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.34532374100719426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.03553930693306029, "learning_rate": 1.968146072782616e-05, "loss": 0.0014, "num_tokens": 15572017.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.34550820881756134, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.048338708467781544, "learning_rate": 1.9680654024259746e-05, "loss": 0.0019, "num_tokens": 15582894.0, "reward": 1.7000000476837158, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3456926766279284, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.07934860605746508, "learning_rate": 1.9679846317067502e-05, "loss": 0.0032, "num_tokens": 15591386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.34587714443829554, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.05576801672577858, "learning_rate": 1.967903760633316e-05, "loss": 0.0022, "num_tokens": 15599439.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.3460616122486626, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04841893323464319, "learning_rate": 1.967822789214057e-05, "loss": 0.0019, "num_tokens": 15608273.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 609.5, "completions/mean_terminated_length": 609.5, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.3462460800590297, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.03979158797301352, "learning_rate": 1.967741717457367e-05, "loss": 0.0016, "num_tokens": 15623437.0, "reward": 1.2848360538482666, "reward_std": 0.24108274281024933, "rewards/fixed_code_pass_all_test_reward/mean": 0.2848360538482666, "rewards/fixed_code_pass_all_test_reward/std": 0.24108275771141052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 376.875, "completions/mean_terminated_length": 376.875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.3464305478693968, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.02918151766061783, "learning_rate": 1.9676605453716516e-05, "loss": 0.0012, "num_tokens": 15631236.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3466150156797639, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.041618830524384975, "learning_rate": 1.9675792729653256e-05, "loss": 0.0017, "num_tokens": 15635515.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 328.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.34679948349013096, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.03911396209150553, "learning_rate": 1.9674979002468152e-05, "loss": 0.0016, "num_tokens": 15645434.0, "reward": 1.5, "reward_std": 0.1781741827726364, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.1781741827726364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.3469839513004981, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.0460129554849118, "learning_rate": 1.9674164272245565e-05, "loss": 0.0018, "num_tokens": 15656191.0, "reward": 1.6875, "reward_std": 0.3260718882083893, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.32607191801071167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.34716841911086516, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.057686587795615196, "learning_rate": 1.967334853906996e-05, "loss": 0.0023, "num_tokens": 15665486.0, "reward": 1.7053571939468384, "reward_std": 0.29435279965400696, "rewards/fixed_code_pass_all_test_reward/mean": 0.7053571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.29435282945632935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 420.375, "completions/mean_terminated_length": 420.375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.34735288692123223, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.03906270093284547, "learning_rate": 1.9672531803025913e-05, "loss": 0.0016, "num_tokens": 15674153.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 342.375, "completions/mean_terminated_length": 342.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.34753735473159936, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.049105466809123755, "learning_rate": 1.9671714064198086e-05, "loss": 0.002, "num_tokens": 15687060.0, "reward": 1.2715516090393066, "reward_std": 0.44976547360420227, "rewards/fixed_code_pass_all_test_reward/mean": 0.3965517282485962, "rewards/fixed_code_pass_all_test_reward/std": 0.49970266222953796, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 655.75, "completions/mean_terminated_length": 655.75, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.34772182254196643, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.022958481684327126, "learning_rate": 1.9670895322671263e-05, "loss": 0.0009, "num_tokens": 15699162.0, "reward": 1.1458332538604736, "reward_std": 0.058925557881593704, "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.3479062903523335, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.05411185370758176, "learning_rate": 1.9670075578530325e-05, "loss": 0.0022, "num_tokens": 15703223.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.34809075816270063, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.06141763227060437, "learning_rate": 1.9669254831860262e-05, "loss": 0.0025, "num_tokens": 15711713.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.3482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0587551542557776, "learning_rate": 1.9668433082746157e-05, "loss": 0.0024, "num_tokens": 15717699.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3484596937834348, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.04872559616342187, "learning_rate": 1.9667610331273205e-05, "loss": 0.0019, "num_tokens": 15730816.0, "reward": 1.4464285373687744, "reward_std": 0.14328168332576752, "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.1432816982269287, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 468.0, "completions/mean_terminated_length": 468.0, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.3486441615938019, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.03919334663078189, "learning_rate": 1.966678657752671e-05, "loss": 0.0016, "num_tokens": 15743288.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.348828629404169, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.09010765142738819, "learning_rate": 1.966596182159206e-05, "loss": 0.0036, "num_tokens": 15747801.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.34901309721453605, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.06038056802935898, "learning_rate": 1.9665136063554774e-05, "loss": 0.0024, "num_tokens": 15755520.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3491975650249032, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.08439464960247278, "learning_rate": 1.9664309303500455e-05, "loss": 0.0034, "num_tokens": 15759507.0, "reward": 0.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5345224738121033, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.34938203283527025, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.07662152126431465, "learning_rate": 1.9663481541514814e-05, "loss": 0.0031, "num_tokens": 15770110.0, "reward": 1.5597827434539795, "reward_std": 0.22822393476963043, "rewards/fixed_code_pass_all_test_reward/mean": 0.5597826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.22822390496730804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 400.5, "completions/mean_terminated_length": 400.5, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.3495665006456373, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05332147283479571, "learning_rate": 1.966265277768367e-05, "loss": 0.0021, "num_tokens": 15777650.0, "reward": 1.7857143878936768, "reward_std": 0.24414090812206268, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.24414092302322388, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.34975096845600445, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.06096297223120928, "learning_rate": 1.9661823012092945e-05, "loss": 0.0024, "num_tokens": 15785546.0, "reward": 1.8250000476837158, "reward_std": 0.27645719051361084, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.27645716071128845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3499354362663715, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.07233964698389173, "learning_rate": 1.966099224482866e-05, "loss": 0.0029, "num_tokens": 15793456.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3501199040767386, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03652283735573292, "learning_rate": 1.966016047597695e-05, "loss": 0.0015, "num_tokens": 15799346.0, "reward": 1.7999999523162842, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 626.25, "completions/mean_terminated_length": 626.25, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.3503043718871057, "frac_reward_zero_std": 1.0, "grad_norm": 0.05126953125, "kl": 0.038280142238363624, "learning_rate": 1.965932770562404e-05, "loss": 0.0015, "num_tokens": 15810212.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3504888396974728, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.06370309367775917, "learning_rate": 1.965849393385627e-05, "loss": 0.0025, "num_tokens": 15819249.0, "reward": 1.9558823108673096, "reward_std": 0.12478352338075638, "rewards/fixed_code_pass_all_test_reward/mean": 0.9558823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.12478354573249817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 544.5, "completions/mean_terminated_length": 544.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.35067330750783987, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.043001665151678026, "learning_rate": 1.9657659160760078e-05, "loss": 0.0017, "num_tokens": 15829285.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.350857775318207, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.07267709542065859, "learning_rate": 1.9656823386422008e-05, "loss": 0.0029, "num_tokens": 15833628.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 581.0, "completions/mean_terminated_length": 371.4285888671875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.35104224312857407, "frac_reward_zero_std": 0.0, "grad_norm": 0.484375, "kl": 0.05674180522328243, "learning_rate": 1.965598661092871e-05, "loss": 0.0023, "num_tokens": 15843676.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 487.5, "completions/mean_terminated_length": 487.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.35122671093894114, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.05473588057793677, "learning_rate": 1.9655148834366934e-05, "loss": 0.0022, "num_tokens": 15855480.0, "reward": 1.8181817531585693, "reward_std": 0.2201213240623474, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.22012126445770264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3514111787493083, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06024289480410516, "learning_rate": 1.9654310056823534e-05, "loss": 0.0024, "num_tokens": 15865017.0, "reward": 1.6470588445663452, "reward_std": 0.7252251505851746, "rewards/fixed_code_pass_all_test_reward/mean": 0.7720588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.4246920347213745, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.35159564655967535, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.03684038948267698, "learning_rate": 1.9653470278385468e-05, "loss": 0.0015, "num_tokens": 15869535.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3517801143700424, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.11445109848864377, "learning_rate": 1.96526294991398e-05, "loss": 0.0046, "num_tokens": 15875546.0, "reward": 1.1875, "reward_std": 0.4124789834022522, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.3519645821804095, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.06378674600273371, "learning_rate": 1.96517877191737e-05, "loss": 0.0026, "num_tokens": 15884478.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 769.75, "completions/mean_terminated_length": 769.75, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.3521490499907766, "frac_reward_zero_std": 0.0, "grad_norm": 0.427734375, "kl": 0.022394536761566997, "learning_rate": 1.9650944938574433e-05, "loss": 0.0009, "num_tokens": 15901252.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.3523335178011437, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.05032914981711656, "learning_rate": 1.9650101157429377e-05, "loss": 0.002, "num_tokens": 15909018.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.35251798561151076, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.045735320542007685, "learning_rate": 1.9649256375826003e-05, "loss": 0.0018, "num_tokens": 15916193.0, "reward": 1.8556034564971924, "reward_std": 0.31142908334732056, "rewards/fixed_code_pass_all_test_reward/mean": 0.8556034564971924, "rewards/fixed_code_pass_all_test_reward/std": 0.31142905354499817, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.3527024534218779, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.06275790487416089, "learning_rate": 1.96484105938519e-05, "loss": 0.0025, "num_tokens": 15928626.0, "reward": 1.5178571939468384, "reward_std": 0.43153735995292664, "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.43153735995292664, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.35288692123224497, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.056699561420828104, "learning_rate": 1.964756381159475e-05, "loss": 0.0023, "num_tokens": 15934135.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 538.375, "completions/mean_terminated_length": 538.375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.35307138904261204, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.02800741884857416, "learning_rate": 1.9646716029142338e-05, "loss": 0.0011, "num_tokens": 15945954.0, "reward": 1.8576922416687012, "reward_std": 0.18493562936782837, "rewards/fixed_code_pass_all_test_reward/mean": 0.8576923608779907, "rewards/fixed_code_pass_all_test_reward/std": 0.18493562936782837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 463.875, "completions/mean_terminated_length": 463.875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.35325585685297917, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.03093776188325137, "learning_rate": 1.964586724658256e-05, "loss": 0.0012, "num_tokens": 15953385.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 581.125, "completions/mean_terminated_length": 371.5714416503906, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.35344032466334624, "frac_reward_zero_std": 0.0, "grad_norm": 0.390625, "kl": 0.029448895656969398, "learning_rate": 1.9645017464003414e-05, "loss": 0.0012, "num_tokens": 15964418.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 611.0, "completions/mean_terminated_length": 405.71429443359375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3536247924737133, "frac_reward_zero_std": 0.0, "grad_norm": 0.578125, "kl": 0.04182275087805465, "learning_rate": 1.9644166681492997e-05, "loss": 0.0017, "num_tokens": 15975378.0, "reward": 1.375, "reward_std": 0.6648039817810059, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.35380926028408044, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.0175650320306886, "learning_rate": 1.9643314899139514e-05, "loss": 0.0007, "num_tokens": 15981093.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3539937280944475, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05295420857146382, "learning_rate": 1.964246211703127e-05, "loss": 0.0021, "num_tokens": 15990639.0, "reward": 1.7093024253845215, "reward_std": 0.36049190163612366, "rewards/fixed_code_pass_all_test_reward/mean": 0.7093023061752319, "rewards/fixed_code_pass_all_test_reward/std": 0.36049193143844604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 460.75, "completions/mean_terminated_length": 460.75, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.3541781959048146, "frac_reward_zero_std": 0.0, "grad_norm": 0.59765625, "kl": 0.046897446271032095, "learning_rate": 1.9641608335256678e-05, "loss": 0.0019, "num_tokens": 16004421.0, "reward": 1.9147727489471436, "reward_std": 0.24105912446975708, "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.24105913937091827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.3543626637151817, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.045850432477891445, "learning_rate": 1.964075355390425e-05, "loss": 0.0018, "num_tokens": 16011854.0, "reward": 1.7569444179534912, "reward_std": 0.20126910507678986, "rewards/fixed_code_pass_all_test_reward/mean": 0.7569444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.20126911997795105, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 455.5, "completions/mean_terminated_length": 455.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3545471315255488, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.057785764336586, "learning_rate": 1.963989777306261e-05, "loss": 0.0023, "num_tokens": 16023890.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.35473159933591586, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.05538790486752987, "learning_rate": 1.963904099282047e-05, "loss": 0.0022, "num_tokens": 16033494.0, "reward": 1.6624999046325684, "reward_std": 0.1767766773700714, "rewards/fixed_code_pass_all_test_reward/mean": 0.6624999642372131, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.354916067146283, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.03891425649635494, "learning_rate": 1.9638183213266665e-05, "loss": 0.0016, "num_tokens": 16044286.0, "reward": 1.5283019542694092, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5283018946647644, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.35510053495665006, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.11269979272037745, "learning_rate": 1.9637324434490116e-05, "loss": 0.0045, "num_tokens": 16047861.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 400.75, "completions/mean_terminated_length": 400.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.35528500276701713, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.058071716921404004, "learning_rate": 1.9636464656579865e-05, "loss": 0.0023, "num_tokens": 16058915.0, "reward": 1.9287974834442139, "reward_std": 0.20139116048812866, "rewards/fixed_code_pass_all_test_reward/mean": 0.9287974834442139, "rewards/fixed_code_pass_all_test_reward/std": 0.20139117538928986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 639.25, "completions/mean_terminated_length": 639.25, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.35546947057738426, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.03500722371973097, "learning_rate": 1.9635603879625034e-05, "loss": 0.0014, "num_tokens": 16069469.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.35565393838775133, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.017614608630537987, "learning_rate": 1.9634742103714877e-05, "loss": 0.0007, "num_tokens": 16073340.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3558384061981184, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.06307699484750628, "learning_rate": 1.9633879328938724e-05, "loss": 0.0025, "num_tokens": 16084875.0, "reward": 1.9861111640930176, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.35602287400848553, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.05593383149243891, "learning_rate": 1.9633015555386033e-05, "loss": 0.0022, "num_tokens": 16096109.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.3562073418188526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.044489318039268255, "learning_rate": 1.9632150783146353e-05, "loss": 0.0018, "num_tokens": 16104179.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 392.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3563918096292197, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.016771857510320842, "learning_rate": 1.9631285012309332e-05, "loss": 0.0007, "num_tokens": 16111735.0, "reward": 1.7613637447357178, "reward_std": 0.32934945821762085, "rewards/fixed_code_pass_all_test_reward/mean": 0.7613636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.32934945821762085, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3565762774395868, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.07636563898995519, "learning_rate": 1.9630418242964727e-05, "loss": 0.0031, "num_tokens": 16117209.0, "reward": 1.5333333015441895, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333611488342, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 465.625, "completions/mean_terminated_length": 465.625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.3567607452499539, "frac_reward_zero_std": 0.0, "grad_norm": 0.6953125, "kl": 0.02686226787045598, "learning_rate": 1.9629550475202403e-05, "loss": 0.0011, "num_tokens": 16126486.0, "reward": 1.6354167461395264, "reward_std": 0.08838837593793869, "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883610367775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.35694521306032095, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.0657939207740128, "learning_rate": 1.9628681709112326e-05, "loss": 0.0026, "num_tokens": 16135031.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 574.75, "completions/mean_terminated_length": 574.75, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.3571296808706881, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.11114898091182113, "learning_rate": 1.962781194478456e-05, "loss": 0.0044, "num_tokens": 16148621.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.35731414868105515, "frac_reward_zero_std": 1.0, "grad_norm": 1.1328125, "kl": 0.11419446673244238, "learning_rate": 1.962694118230928e-05, "loss": 0.0046, "num_tokens": 16153459.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3574986164914222, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06929788598790765, "learning_rate": 1.9626069421776753e-05, "loss": 0.0028, "num_tokens": 16162490.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.35768308430178936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.07319977739825845, "learning_rate": 1.9625196663277368e-05, "loss": 0.0029, "num_tokens": 16171397.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.35786755211215643, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.038997467316221446, "learning_rate": 1.9624322906901596e-05, "loss": 0.0016, "num_tokens": 16177030.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3580520199225235, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.027502761920914054, "learning_rate": 1.962344815274003e-05, "loss": 0.0011, "num_tokens": 16182124.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.35823648773289063, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.055199426133185625, "learning_rate": 1.962257240088336e-05, "loss": 0.0022, "num_tokens": 16192655.0, "reward": 1.25, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3584209555432577, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.04376601963303983, "learning_rate": 1.962169565142237e-05, "loss": 0.0018, "num_tokens": 16199347.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.3586054233536248, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04284642159473151, "learning_rate": 1.9620817904447964e-05, "loss": 0.0017, "num_tokens": 16206270.0, "reward": 1.98828125, "reward_std": 0.03314562886953354, "rewards/fixed_code_pass_all_test_reward/mean": 0.98828125, "rewards/fixed_code_pass_all_test_reward/std": 0.03314562886953354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3587898911639919, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.037382343667559326, "learning_rate": 1.9619939160051136e-05, "loss": 0.0015, "num_tokens": 16211536.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.358974358974359, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.015462528681382537, "learning_rate": 1.9619059418322987e-05, "loss": 0.0006, "num_tokens": 16218195.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.35915882678472605, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.022646317083854228, "learning_rate": 1.961817867935473e-05, "loss": 0.0009, "num_tokens": 16224634.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.3593432945950932, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04282309440895915, "learning_rate": 1.9617296943237667e-05, "loss": 0.0017, "num_tokens": 16230556.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.35952776240546025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0390625, "kl": 0.019059712416492403, "learning_rate": 1.961641421006321e-05, "loss": 0.0008, "num_tokens": 16235480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3597122302158273, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.054892236832529306, "learning_rate": 1.9615530479922885e-05, "loss": 0.0022, "num_tokens": 16240385.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.35989669802619445, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.0931207686662674, "learning_rate": 1.96146457529083e-05, "loss": 0.0037, "num_tokens": 16248357.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.3600811658365615, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.06492138048633933, "learning_rate": 1.9613760029111183e-05, "loss": 0.0026, "num_tokens": 16255192.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 684.5, "completions/mean_terminated_length": 684.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.3602656336469286, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.02135612870915793, "learning_rate": 1.9612873308623355e-05, "loss": 0.0009, "num_tokens": 16264196.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3604501014572957, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.027234005508944392, "learning_rate": 1.9611985591536755e-05, "loss": 0.0011, "num_tokens": 16270749.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3606345692676628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04629516368731856, "learning_rate": 1.9611096877943404e-05, "loss": 0.0019, "num_tokens": 16277405.0, "reward": 1.6938775777816772, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.36081903707802987, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.05965818534605205, "learning_rate": 1.9610207167935446e-05, "loss": 0.0024, "num_tokens": 16287906.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.361003504888397, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.04203966353088617, "learning_rate": 1.960931646160512e-05, "loss": 0.0017, "num_tokens": 16298966.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 430.125, "completions/mean_terminated_length": 430.125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.36118797269876407, "frac_reward_zero_std": 1.0, "grad_norm": 1.09375, "kl": 0.10930985654704273, "learning_rate": 1.9608424759044763e-05, "loss": 0.0044, "num_tokens": 16306887.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 499.0, "completions/mean_terminated_length": 499.0, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.36137244050913114, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.027180141711141914, "learning_rate": 1.9607532060346828e-05, "loss": 0.0011, "num_tokens": 16319527.0, "reward": 1.2291666269302368, "reward_std": 0.7586581110954285, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.29574811458587646, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 595.5, "completions/mean_terminated_length": 595.5, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.36155690831949827, "frac_reward_zero_std": 0.0, "grad_norm": 0.59375, "kl": 0.021035279147326946, "learning_rate": 1.9606638365603855e-05, "loss": 0.0008, "num_tokens": 16333363.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.36174137612986534, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.05404191114939749, "learning_rate": 1.9605743674908506e-05, "loss": 0.0022, "num_tokens": 16341038.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3619258439402324, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07245515380054712, "learning_rate": 1.9604847988353528e-05, "loss": 0.0029, "num_tokens": 16350591.0, "reward": 1.383928656578064, "reward_std": 0.3873688876628876, "rewards/fixed_code_pass_all_test_reward/mean": 0.3839285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.3873688876628876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.36211031175059955, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.056247488828375936, "learning_rate": 1.9603951306031787e-05, "loss": 0.0023, "num_tokens": 16360707.0, "reward": 1.7939189672470093, "reward_std": 0.09438751637935638, "rewards/fixed_code_pass_all_test_reward/mean": 0.7939189672470093, "rewards/fixed_code_pass_all_test_reward/std": 0.09438753128051758, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3622947795609666, "frac_reward_zero_std": 1.0, "grad_norm": 0.017333984375, "kl": 0.01138675882248208, "learning_rate": 1.960305362803624e-05, "loss": 0.0005, "num_tokens": 16368216.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3624792473713337, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.022903859382495284, "learning_rate": 1.960215495445996e-05, "loss": 0.0009, "num_tokens": 16372628.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 424.0, "completions/mean_terminated_length": 424.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3626637151817008, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.048770251451060176, "learning_rate": 1.96012552853961e-05, "loss": 0.002, "num_tokens": 16385204.0, "reward": 1.6815476417541504, "reward_std": 0.1504165232181549, "rewards/fixed_code_pass_all_test_reward/mean": 0.6815476417541504, "rewards/fixed_code_pass_all_test_reward/std": 0.15041649341583252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 431.625, "completions/mean_terminated_length": 431.625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.3628481829920679, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.03717265767045319, "learning_rate": 1.960035462093795e-05, "loss": 0.0015, "num_tokens": 16393881.0, "reward": 1.828125, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 451.5, "completions/mean_terminated_length": 451.5, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.36303265080243496, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.046876417472958565, "learning_rate": 1.959945296117887e-05, "loss": 0.0019, "num_tokens": 16406581.0, "reward": 1.0508474111557007, "reward_std": 0.3406737148761749, "rewards/fixed_code_pass_all_test_reward/mean": 0.17584745585918427, "rewards/fixed_code_pass_all_test_reward/std": 0.061404142528772354, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3632171186128021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.039232010720297694, "learning_rate": 1.959855030621235e-05, "loss": 0.0016, "num_tokens": 16411691.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 404.75, "completions/mean_terminated_length": 404.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.36340158642316917, "frac_reward_zero_std": 1.0, "grad_norm": 0.023681640625, "kl": 0.02652712434064597, "learning_rate": 1.959764665613196e-05, "loss": 0.0011, "num_tokens": 16421193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 350.75, "completions/mean_terminated_length": 350.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.36358605423353624, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.046083953231573105, "learning_rate": 1.9596742011031394e-05, "loss": 0.0018, "num_tokens": 16432591.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.36377052204390337, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.07487546419724822, "learning_rate": 1.9595836371004434e-05, "loss": 0.003, "num_tokens": 16438014.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.36395498985427044, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.040786647354252636, "learning_rate": 1.9594929736144978e-05, "loss": 0.0016, "num_tokens": 16446417.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.3641394576646375, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.05210401746444404, "learning_rate": 1.9594022106547007e-05, "loss": 0.0021, "num_tokens": 16452786.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3643239254750046, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.053174142027273774, "learning_rate": 1.959311348230463e-05, "loss": 0.0021, "num_tokens": 16462706.0, "reward": 1.6607142686843872, "reward_std": 0.527315080165863, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.24743583798408508, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 473.125, "completions/mean_terminated_length": 248.1428680419922, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3645083932853717, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.04191380526754074, "learning_rate": 1.9592203863512038e-05, "loss": 0.0017, "num_tokens": 16474083.0, "reward": 1.1011905670166016, "reward_std": 0.4611133635044098, "rewards/fixed_code_pass_all_test_reward/mean": 0.2261904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.15165644884109497, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3646928610957388, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.09459242376033217, "learning_rate": 1.9591293250263542e-05, "loss": 0.0038, "num_tokens": 16480655.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.36487732890610586, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.05401582596823573, "learning_rate": 1.9590381642653546e-05, "loss": 0.0022, "num_tokens": 16489318.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.365061796716473, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.11960953380912542, "learning_rate": 1.9589469040776554e-05, "loss": 0.0048, "num_tokens": 16500055.0, "reward": 1.3636363744735718, "reward_std": 0.2571297585964203, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 509.0, "completions/mean_terminated_length": 509.0, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.36524626452684006, "frac_reward_zero_std": 0.0, "grad_norm": 0.60546875, "kl": 0.029337375657632947, "learning_rate": 1.9588555444727186e-05, "loss": 0.0012, "num_tokens": 16512055.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 223.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.36543073233720713, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.021767294965684414, "learning_rate": 1.9587640854600155e-05, "loss": 0.0009, "num_tokens": 16516920.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 506.0, "completions/mean_terminated_length": 506.0, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.36561520014757426, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.03980931709520519, "learning_rate": 1.9586725270490277e-05, "loss": 0.0016, "num_tokens": 16525264.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.36579966795794133, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.04554255283437669, "learning_rate": 1.9585808692492477e-05, "loss": 0.0018, "num_tokens": 16529598.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3659841357683084, "frac_reward_zero_std": 1.0, "grad_norm": 0.055908203125, "kl": 0.01147538810619153, "learning_rate": 1.958489112070178e-05, "loss": 0.0005, "num_tokens": 16535219.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 451.0, "completions/mean_terminated_length": 451.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.36616860357867553, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.051714921137318015, "learning_rate": 1.958397255521331e-05, "loss": 0.0021, "num_tokens": 16548395.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.3663530713890426, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.05783073417842388, "learning_rate": 1.9583052996122297e-05, "loss": 0.0023, "num_tokens": 16556432.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3665375391994097, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.05736004235222936, "learning_rate": 1.9582132443524084e-05, "loss": 0.0023, "num_tokens": 16567251.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 632.25, "completions/mean_terminated_length": 632.25, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.3667220070097768, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04524946981109679, "learning_rate": 1.9581210897514097e-05, "loss": 0.0018, "num_tokens": 16583005.0, "reward": 1.5703125, "reward_std": 0.4869244694709778, "rewards/fixed_code_pass_all_test_reward/mean": 0.6953125, "rewards/fixed_code_pass_all_test_reward/std": 0.3314563035964966, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.3669064748201439, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.0239481118042022, "learning_rate": 1.9580288358187882e-05, "loss": 0.001, "num_tokens": 16587889.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.36709094263051095, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.07658073888160288, "learning_rate": 1.9579364825641082e-05, "loss": 0.0031, "num_tokens": 16592294.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 429.875, "completions/mean_terminated_length": 429.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3672754104408781, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.03958742739632726, "learning_rate": 1.957844029996944e-05, "loss": 0.0016, "num_tokens": 16601677.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.36745987825124515, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.08483300544321537, "learning_rate": 1.95775147812688e-05, "loss": 0.0034, "num_tokens": 16607393.0, "reward": 1.4328703880310059, "reward_std": 0.4696279764175415, "rewards/fixed_code_pass_all_test_reward/mean": 0.43287038803100586, "rewards/fixed_code_pass_all_test_reward/std": 0.4696279466152191, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 557.25, "completions/mean_terminated_length": 344.2857360839844, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.3676443460616122, "frac_reward_zero_std": 0.0, "grad_norm": 0.59375, "kl": 0.04511245700996369, "learning_rate": 1.9576588269635123e-05, "loss": 0.0018, "num_tokens": 16616739.0, "reward": 1.4821429252624512, "reward_std": 0.744023859500885, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.5050762891769409, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.36782881387197935, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.06390029191970825, "learning_rate": 1.9575660765164462e-05, "loss": 0.0026, "num_tokens": 16625886.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3680132816823464, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.049923276295885444, "learning_rate": 1.9574732267952972e-05, "loss": 0.002, "num_tokens": 16633764.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3681977494927135, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.07434457261115313, "learning_rate": 1.957380277809691e-05, "loss": 0.003, "num_tokens": 16642320.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 327.625, "completions/mean_terminated_length": 327.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.36838221730308063, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.09358723415061831, "learning_rate": 1.9572872295692643e-05, "loss": 0.0037, "num_tokens": 16651141.0, "reward": 1.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.3685666851134477, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.05443906970322132, "learning_rate": 1.9571940820836638e-05, "loss": 0.0022, "num_tokens": 16655628.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.3687511529238148, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.06278847018256783, "learning_rate": 1.9571008353625466e-05, "loss": 0.0025, "num_tokens": 16661999.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 453.375, "completions/mean_terminated_length": 453.375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3689356207341819, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.03801625268533826, "learning_rate": 1.9570074894155792e-05, "loss": 0.0015, "num_tokens": 16671474.0, "reward": 1.5130208730697632, "reward_std": 0.4067171812057495, "rewards/fixed_code_pass_all_test_reward/mean": 0.6380208730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.257799357175827, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 488.125, "completions/mean_terminated_length": 488.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.369120088544549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0198974609375, "kl": 0.007652744097867981, "learning_rate": 1.9569140442524396e-05, "loss": 0.0003, "num_tokens": 16678843.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.36930455635491605, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.05371675547212362, "learning_rate": 1.9568204998828157e-05, "loss": 0.0021, "num_tokens": 16685948.0, "reward": 1.4153225421905518, "reward_std": 0.4841589331626892, "rewards/fixed_code_pass_all_test_reward/mean": 0.41532257199287415, "rewards/fixed_code_pass_all_test_reward/std": 0.4841589033603668, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 579.375, "completions/mean_terminated_length": 579.375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.3694890241652832, "frac_reward_zero_std": 0.0, "grad_norm": 0.41796875, "kl": 0.022888626786880195, "learning_rate": 1.9567268563164052e-05, "loss": 0.0009, "num_tokens": 16698271.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.36967349197565025, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.044051578268408775, "learning_rate": 1.9566331135629166e-05, "loss": 0.0018, "num_tokens": 16710639.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 582.25, "completions/mean_terminated_length": 582.25, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.3698579597860173, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.017927406821399927, "learning_rate": 1.9565392716320685e-05, "loss": 0.0007, "num_tokens": 16720793.0, "reward": 1.8928570747375488, "reward_std": 0.19839003682136536, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.37004242759638445, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.07648348552174866, "learning_rate": 1.9564453305335896e-05, "loss": 0.0031, "num_tokens": 16730624.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3702268954067515, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.04902959894388914, "learning_rate": 1.9563512902772197e-05, "loss": 0.002, "num_tokens": 16738541.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3704113632171186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.03605345683172345, "learning_rate": 1.956257150872708e-05, "loss": 0.0014, "num_tokens": 16747942.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3705958310274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.059115244541317225, "learning_rate": 1.9561629123298133e-05, "loss": 0.0024, "num_tokens": 16756905.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3707802988378528, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.07784890243783593, "learning_rate": 1.956068574658307e-05, "loss": 0.0031, "num_tokens": 16764890.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.37096476664821987, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03753680735826492, "learning_rate": 1.9559741378679686e-05, "loss": 0.0015, "num_tokens": 16773230.0, "reward": 1.7857142686843872, "reward_std": 0.18177399039268494, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.18177400529384613, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.371149234458587, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.05322813929524273, "learning_rate": 1.955879601968589e-05, "loss": 0.0021, "num_tokens": 16783281.0, "reward": 1.0480769872665405, "reward_std": 0.5363971590995789, "rewards/fixed_code_pass_all_test_reward/mean": 0.17307692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.33655810356140137, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.37133370226895407, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.031371780671179295, "learning_rate": 1.9557849669699693e-05, "loss": 0.0013, "num_tokens": 16789243.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.37151817007932114, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.10677461046725512, "learning_rate": 1.9556902328819204e-05, "loss": 0.0043, "num_tokens": 16798713.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.37170263788968827, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.03444643720285967, "learning_rate": 1.955595399714263e-05, "loss": 0.0014, "num_tokens": 16803409.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.37188710570005534, "frac_reward_zero_std": 1.0, "grad_norm": 0.037353515625, "kl": 0.026809882023371756, "learning_rate": 1.9555004674768295e-05, "loss": 0.0011, "num_tokens": 16810430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.3720715735104224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.07040323503315449, "learning_rate": 1.955405436179462e-05, "loss": 0.0028, "num_tokens": 16817065.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 342.0, "completions/mean_terminated_length": 342.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.37225604132078954, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04891491308808327, "learning_rate": 1.955310305832012e-05, "loss": 0.002, "num_tokens": 16824465.0, "reward": 1.6041667461395264, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.3724405091311566, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.04219663317780942, "learning_rate": 1.955215076444343e-05, "loss": 0.0017, "num_tokens": 16833437.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3726249769415237, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.027353938901796937, "learning_rate": 1.9551197480263268e-05, "loss": 0.0011, "num_tokens": 16839454.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 459.5, "completions/mean_terminated_length": 459.5, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.3728094447518908, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.046773074893280864, "learning_rate": 1.955024320587847e-05, "loss": 0.0019, "num_tokens": 16848418.0, "reward": 1.4861111640930176, "reward_std": 0.4966821074485779, "rewards/fixed_code_pass_all_test_reward/mean": 0.6111111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.4824979305267334, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 379.75, "completions/mean_terminated_length": 379.75, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3729939125622579, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.07454425725154579, "learning_rate": 1.9549287941387966e-05, "loss": 0.003, "num_tokens": 16858448.0, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.37317838037262496, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.03552156710065901, "learning_rate": 1.9548331686890793e-05, "loss": 0.0014, "num_tokens": 16868427.0, "reward": 1.953125, "reward_std": 0.13258251547813416, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3733628481829921, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.06974249822087586, "learning_rate": 1.9547374442486086e-05, "loss": 0.0028, "num_tokens": 16873514.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.37354731599335916, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.060181002132594585, "learning_rate": 1.9546416208273086e-05, "loss": 0.0024, "num_tokens": 16883558.0, "reward": 1.3611111640930176, "reward_std": 0.07856737822294235, "rewards/fixed_code_pass_all_test_reward/mean": 0.3611111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.07856743037700653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.37373178380372624, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.03448632266372442, "learning_rate": 1.954545698435114e-05, "loss": 0.0014, "num_tokens": 16888217.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 398.875, "completions/mean_terminated_length": 398.875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.37391625161409336, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.06144628161564469, "learning_rate": 1.9544496770819692e-05, "loss": 0.0025, "num_tokens": 16900648.0, "reward": 1.6931817531585693, "reward_std": 0.33291494846343994, "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.33291494846343994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.37410071942446044, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.06268365075811744, "learning_rate": 1.9543535567778293e-05, "loss": 0.0025, "num_tokens": 16907152.0, "reward": 1.6691176891326904, "reward_std": 0.35004061460494995, "rewards/fixed_code_pass_all_test_reward/mean": 0.7941176891326904, "rewards/fixed_code_pass_all_test_reward/std": 0.04159451276063919, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 372.625, "completions/mean_terminated_length": 372.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.3742851872348275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.042413814924657345, "learning_rate": 1.9542573375326592e-05, "loss": 0.0017, "num_tokens": 16915013.0, "reward": 1.6363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.37446965504519464, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.050219545140862465, "learning_rate": 1.954161019356434e-05, "loss": 0.002, "num_tokens": 16919804.0, "reward": 0.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3746541228555617, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.04159138537943363, "learning_rate": 1.95406460225914e-05, "loss": 0.0017, "num_tokens": 16923978.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3748385906659288, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.049045275431126356, "learning_rate": 1.953968086250772e-05, "loss": 0.002, "num_tokens": 16928346.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.3750230584762959, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.12399580841884017, "learning_rate": 1.9538714713413373e-05, "loss": 0.005, "num_tokens": 16938781.0, "reward": 1.4930555820465088, "reward_std": 0.2438620775938034, "rewards/fixed_code_pass_all_test_reward/mean": 0.4930555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.2438620626926422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.375207526286663, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.05269163614138961, "learning_rate": 1.9537747575408515e-05, "loss": 0.0021, "num_tokens": 16948673.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.37539199409703006, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.044148302054964006, "learning_rate": 1.9536779448593418e-05, "loss": 0.0018, "num_tokens": 16952819.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.3755764619073972, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.03891810623463243, "learning_rate": 1.953581033306845e-05, "loss": 0.0016, "num_tokens": 16959408.0, "reward": 1.9305555820465088, "reward_std": 0.19641853868961334, "rewards/fixed_code_pass_all_test_reward/mean": 0.9305555820465088, "rewards/fixed_code_pass_all_test_reward/std": 0.19641855359077454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.37576092971776426, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.04182920535095036, "learning_rate": 1.9534840228934077e-05, "loss": 0.0017, "num_tokens": 16967001.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.37594539752813133, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.026598265510983765, "learning_rate": 1.9533869136290882e-05, "loss": 0.0011, "num_tokens": 16973295.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.37612986533849846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.03392723994329572, "learning_rate": 1.953289705523953e-05, "loss": 0.0014, "num_tokens": 16977837.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.37631433314886553, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.055806771386414766, "learning_rate": 1.953192398588081e-05, "loss": 0.0022, "num_tokens": 16981845.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3764988009592326, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05344292731024325, "learning_rate": 1.9530949928315598e-05, "loss": 0.0021, "num_tokens": 16990914.0, "reward": 1.8624999523162842, "reward_std": 0.11386082321405411, "rewards/fixed_code_pass_all_test_reward/mean": 0.8624999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.1138608381152153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 299.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3766832687695997, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.06335927359759808, "learning_rate": 1.9529974882644882e-05, "loss": 0.0025, "num_tokens": 16997436.0, "reward": 1.798076868057251, "reward_std": 0.29056012630462646, "rewards/fixed_code_pass_all_test_reward/mean": 0.7980769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.29056015610694885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3768677365799668, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0410170522518456, "learning_rate": 1.9528998848969746e-05, "loss": 0.0016, "num_tokens": 17008146.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 417.5, "completions/mean_terminated_length": 417.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.3770522043903339, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.03826856892555952, "learning_rate": 1.9528021827391376e-05, "loss": 0.0015, "num_tokens": 17016030.0, "reward": 1.8081395626068115, "reward_std": 0.245723694562912, "rewards/fixed_code_pass_all_test_reward/mean": 0.8081395626068115, "rewards/fixed_code_pass_all_test_reward/std": 0.245723694562912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.37723667220070095, "frac_reward_zero_std": 0.0, "grad_norm": 0.6953125, "kl": 0.03932886617258191, "learning_rate": 1.9527043818011063e-05, "loss": 0.0016, "num_tokens": 17026671.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3774211400110681, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04826147947460413, "learning_rate": 1.9526064820930205e-05, "loss": 0.0019, "num_tokens": 17036082.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 581.75, "completions/mean_terminated_length": 581.75, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.37760560782143515, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.031019500456750393, "learning_rate": 1.9525084836250296e-05, "loss": 0.0012, "num_tokens": 17051008.0, "reward": 1.6083333492279053, "reward_std": 0.39107102155685425, "rewards/fixed_code_pass_all_test_reward/mean": 0.6083333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.39107099175453186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.3777900756318022, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.05638855532743037, "learning_rate": 1.9524103864072933e-05, "loss": 0.0023, "num_tokens": 17055485.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.37797454344216935, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.020734177203848958, "learning_rate": 1.952312190449982e-05, "loss": 0.0008, "num_tokens": 17061617.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.3781590112525364, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.04868621751666069, "learning_rate": 1.9522138957632758e-05, "loss": 0.0019, "num_tokens": 17071583.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 563.625, "completions/mean_terminated_length": 563.625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.3783434790629035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.03754097945056856, "learning_rate": 1.9521155023573648e-05, "loss": 0.0015, "num_tokens": 17087164.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 521.125, "completions/mean_terminated_length": 521.125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.3785279468732706, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.04583070264197886, "learning_rate": 1.9520170102424506e-05, "loss": 0.0018, "num_tokens": 17096629.0, "reward": 0.3583333492279053, "reward_std": 0.6826070547103882, "rewards/fixed_code_pass_all_test_reward/mean": 0.10833333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.2568119466304779, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.3787124146836377, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.04443188733421266, "learning_rate": 1.9519184194287434e-05, "loss": 0.0018, "num_tokens": 17102482.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.37889688249400477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.03437469992786646, "learning_rate": 1.9518197299264652e-05, "loss": 0.0014, "num_tokens": 17111489.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 599.125, "completions/mean_terminated_length": 599.125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.3790813503043719, "frac_reward_zero_std": 0.0, "grad_norm": 0.6953125, "kl": 0.04572005337104201, "learning_rate": 1.951720941745847e-05, "loss": 0.0018, "num_tokens": 17124202.0, "reward": 1.1666667461395264, "reward_std": 0.3421454131603241, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.3421454429626465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.379265818114739, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.09020740818232298, "learning_rate": 1.951622054897131e-05, "loss": 0.0036, "num_tokens": 17134042.0, "reward": 1.5297619104385376, "reward_std": 0.2607426047325134, "rewards/fixed_code_pass_all_test_reward/mean": 0.5297619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.2607426345348358, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.37945028592510605, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.07114747120067477, "learning_rate": 1.9515230693905682e-05, "loss": 0.0028, "num_tokens": 17142521.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 392.0, "completions/mean_terminated_length": 392.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3796347537354732, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.07602614630013704, "learning_rate": 1.9514239852364214e-05, "loss": 0.003, "num_tokens": 17152953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.37981922154584025, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.05567008024081588, "learning_rate": 1.951324802444963e-05, "loss": 0.0022, "num_tokens": 17157671.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3800036893562073, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.045564227970317006, "learning_rate": 1.9512255210264755e-05, "loss": 0.0018, "num_tokens": 17164819.0, "reward": 1.509615421295166, "reward_std": 0.47407424449920654, "rewards/fixed_code_pass_all_test_reward/mean": 0.5096153616905212, "rewards/fixed_code_pass_all_test_reward/std": 0.47407427430152893, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 511.625, "completions/mean_terminated_length": 511.625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.38018815716657445, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.046960206469520926, "learning_rate": 1.9511261409912515e-05, "loss": 0.0019, "num_tokens": 17174152.0, "reward": 1.5546875, "reward_std": 0.3230726718902588, "rewards/fixed_code_pass_all_test_reward/mean": 0.5546875, "rewards/fixed_code_pass_all_test_reward/std": 0.3230726420879364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 468.5, "completions/mean_terminated_length": 468.5, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.3803726249769415, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.05107648251578212, "learning_rate": 1.9510266623495947e-05, "loss": 0.002, "num_tokens": 17185076.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 490.75, "completions/mean_terminated_length": 490.75, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.3805570927873086, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.05041854712180793, "learning_rate": 1.9509270851118175e-05, "loss": 0.002, "num_tokens": 17195266.0, "reward": 1.6607142686843872, "reward_std": 0.716069221496582, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 557.75, "completions/mean_terminated_length": 557.75, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.3807415605976757, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.07225440349429846, "learning_rate": 1.950827409288244e-05, "loss": 0.0029, "num_tokens": 17208736.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 548.125, "completions/mean_terminated_length": 548.125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.3809260284080428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.07840250292792916, "learning_rate": 1.9507276348892085e-05, "loss": 0.0031, "num_tokens": 17222073.0, "reward": 0.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.38111049621840987, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.07094631530344486, "learning_rate": 1.9506277619250535e-05, "loss": 0.0028, "num_tokens": 17231919.0, "reward": 1.524193525314331, "reward_std": 0.4344158172607422, "rewards/fixed_code_pass_all_test_reward/mean": 0.524193525314331, "rewards/fixed_code_pass_all_test_reward/std": 0.4344158172607422, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.381294964028777, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.030968716484494507, "learning_rate": 1.9505277904061343e-05, "loss": 0.0012, "num_tokens": 17238643.0, "reward": 1.8916666507720947, "reward_std": 0.3064129650592804, "rewards/fixed_code_pass_all_test_reward/mean": 0.8916666507720947, "rewards/fixed_code_pass_all_test_reward/std": 0.306412935256958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.38147943183914407, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.05680610775016248, "learning_rate": 1.9504277203428148e-05, "loss": 0.0023, "num_tokens": 17248619.0, "reward": 1.8571429252624512, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3499270975589752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.38166389964951114, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.09418758423998952, "learning_rate": 1.9503275517454696e-05, "loss": 0.0038, "num_tokens": 17252660.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.38184836745987827, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02664883271791041, "learning_rate": 1.9502272846244835e-05, "loss": 0.0011, "num_tokens": 17259290.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 640.75, "completions/mean_terminated_length": 171.6666717529297, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.38203283527024534, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.053099905722774565, "learning_rate": 1.950126918990252e-05, "loss": 0.0021, "num_tokens": 17267288.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.3822173030806124, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.04842101130634546, "learning_rate": 1.95002645485318e-05, "loss": 0.0019, "num_tokens": 17276767.0, "reward": 1.5, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 329.0, "completions/mean_terminated_length": 329.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.38240177089097954, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.05296260421164334, "learning_rate": 1.9499258922236825e-05, "loss": 0.0021, "num_tokens": 17286327.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 459.0, "completions/mean_terminated_length": 459.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3825862387013466, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.029027825919911265, "learning_rate": 1.949825231112186e-05, "loss": 0.0012, "num_tokens": 17296679.0, "reward": 1.8645833730697632, "reward_std": 0.25074291229248047, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.25074294209480286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3827707065117137, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.06703383522108197, "learning_rate": 1.949724471529126e-05, "loss": 0.0027, "num_tokens": 17304273.0, "reward": 1.375, "reward_std": 0.13363061845302582, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 433.625, "completions/mean_terminated_length": 433.625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.3829551743220808, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.060566093772649765, "learning_rate": 1.9496236134849485e-05, "loss": 0.0024, "num_tokens": 17312598.0, "reward": 1.6041666269302368, "reward_std": 0.4266657531261444, "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.426665723323822, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3831396421324479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.06383937736973166, "learning_rate": 1.9495226569901098e-05, "loss": 0.0026, "num_tokens": 17323864.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.38332410994281496, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.04194837110117078, "learning_rate": 1.9494216020550765e-05, "loss": 0.0017, "num_tokens": 17328165.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3835085777531821, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.04810000048018992, "learning_rate": 1.9493204486903252e-05, "loss": 0.0019, "num_tokens": 17336731.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.38369304556354916, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0708970776759088, "learning_rate": 1.9492191969063427e-05, "loss": 0.0028, "num_tokens": 17344352.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.38387751337391623, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.050205412320792675, "learning_rate": 1.9491178467136265e-05, "loss": 0.002, "num_tokens": 17353952.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.38406198118428336, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.06541570695117116, "learning_rate": 1.9490163981226836e-05, "loss": 0.0026, "num_tokens": 17361737.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.38424644899465044, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05118050426244736, "learning_rate": 1.948914851144032e-05, "loss": 0.002, "num_tokens": 17370454.0, "reward": 1.9318182468414307, "reward_std": 0.09409989416599274, "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181276321411, "rewards/fixed_code_pass_all_test_reward/std": 0.09409984946250916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.3844309168050175, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.0463952929712832, "learning_rate": 1.9488132057881984e-05, "loss": 0.0019, "num_tokens": 17377419.0, "reward": 1.2959184646606445, "reward_std": 0.3390458822250366, "rewards/fixed_code_pass_all_test_reward/mean": 0.42091837525367737, "rewards/fixed_code_pass_all_test_reward/std": 0.06164819374680519, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.38461538461538464, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.03915958805009723, "learning_rate": 1.9487114620657216e-05, "loss": 0.0016, "num_tokens": 17382205.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3847998524257517, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.06372892530634999, "learning_rate": 1.9486096199871497e-05, "loss": 0.0025, "num_tokens": 17391594.0, "reward": 1.798387050628662, "reward_std": 0.3733145296573639, "rewards/fixed_code_pass_all_test_reward/mean": 0.7983871102333069, "rewards/fixed_code_pass_all_test_reward/std": 0.3733145594596863, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 383.75, "completions/mean_terminated_length": 383.75, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.3849843202361188, "frac_reward_zero_std": 1.0, "grad_norm": 0.2060546875, "kl": 0.05129766184836626, "learning_rate": 1.9485076795630406e-05, "loss": 0.0021, "num_tokens": 17399824.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3851687880464859, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.02355732163414359, "learning_rate": 1.948405640803963e-05, "loss": 0.0009, "num_tokens": 17409866.0, "reward": 1.46875, "reward_std": 0.4712729752063751, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.385353255856853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.025293090206105262, "learning_rate": 1.9483035037204955e-05, "loss": 0.001, "num_tokens": 17415406.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.38553772366722006, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.03329848707653582, "learning_rate": 1.948201268323227e-05, "loss": 0.0013, "num_tokens": 17420518.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3857221914775872, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05645864538382739, "learning_rate": 1.9480989346227565e-05, "loss": 0.0023, "num_tokens": 17431089.0, "reward": 1.7620967626571655, "reward_std": 0.44126981496810913, "rewards/fixed_code_pass_all_test_reward/mean": 0.7620967626571655, "rewards/fixed_code_pass_all_test_reward/std": 0.44126981496810913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 540.0, "completions/mean_terminated_length": 540.0, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.38590665928795426, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.03197801043279469, "learning_rate": 1.9479965026296938e-05, "loss": 0.0013, "num_tokens": 17441361.0, "reward": 1.6510417461395264, "reward_std": 0.22097088396549225, "rewards/fixed_code_pass_all_test_reward/mean": 0.6510416865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.38609112709832133, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.02311966847628355, "learning_rate": 1.947893972354658e-05, "loss": 0.0009, "num_tokens": 17446990.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.38627559490868846, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03848932380788028, "learning_rate": 1.9477913438082785e-05, "loss": 0.0015, "num_tokens": 17456292.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.38646006271905553, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.09161822684109211, "learning_rate": 1.9476886170011955e-05, "loss": 0.0037, "num_tokens": 17465129.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.3866445305294226, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.03116665210109204, "learning_rate": 1.947585791944059e-05, "loss": 0.0012, "num_tokens": 17473950.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.38682899833978973, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.0637482744641602, "learning_rate": 1.9474828686475286e-05, "loss": 0.0025, "num_tokens": 17479189.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3870134661501568, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.04604291799478233, "learning_rate": 1.947379847122276e-05, "loss": 0.0018, "num_tokens": 17482879.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3871979339605239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.06750999251380563, "learning_rate": 1.9472767273789804e-05, "loss": 0.0027, "num_tokens": 17491914.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.387382401770891, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.07673660106956959, "learning_rate": 1.9471735094283337e-05, "loss": 0.0031, "num_tokens": 17495752.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3875668695812581, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.03214054787531495, "learning_rate": 1.9470701932810364e-05, "loss": 0.0013, "num_tokens": 17500995.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.38775133739162515, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.058309352956712246, "learning_rate": 1.946966778947799e-05, "loss": 0.0023, "num_tokens": 17507595.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3879358052019923, "frac_reward_zero_std": 1.0, "grad_norm": 0.1611328125, "kl": 0.11231824476271868, "learning_rate": 1.946863266439344e-05, "loss": 0.0045, "num_tokens": 17514197.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.38812027301235935, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.0626998080406338, "learning_rate": 1.9467596557664018e-05, "loss": 0.0025, "num_tokens": 17522012.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3883047408227264, "frac_reward_zero_std": 1.0, "grad_norm": 0.185546875, "kl": 0.06708673760294914, "learning_rate": 1.946655946939715e-05, "loss": 0.0027, "num_tokens": 17531029.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.38848920863309355, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.08421134203672409, "learning_rate": 1.9465521399700346e-05, "loss": 0.0034, "num_tokens": 17538282.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3886736764434606, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.07015104405581951, "learning_rate": 1.9464482348681232e-05, "loss": 0.0028, "num_tokens": 17543132.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3888581442538277, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.056338427821174264, "learning_rate": 1.9463442316447527e-05, "loss": 0.0023, "num_tokens": 17547753.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.38904261206419477, "frac_reward_zero_std": 0.0, "grad_norm": 0.69140625, "kl": 0.043425839161500335, "learning_rate": 1.946240130310706e-05, "loss": 0.0017, "num_tokens": 17561623.0, "reward": 1.0204918384552002, "reward_std": 0.057959601283073425, "rewards/fixed_code_pass_all_test_reward/mean": 0.02049180306494236, "rewards/fixed_code_pass_all_test_reward/std": 0.05795957148075104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3892270798745619, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.0684242770075798, "learning_rate": 1.9461359308767745e-05, "loss": 0.0027, "num_tokens": 17570264.0, "reward": 1.807692289352417, "reward_std": 0.376844584941864, "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.38941154768492897, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.06828240863978863, "learning_rate": 1.946031633353762e-05, "loss": 0.0027, "num_tokens": 17580990.0, "reward": 1.8729166984558105, "reward_std": 0.14084140956401825, "rewards/fixed_code_pass_all_test_reward/mean": 0.8729166984558105, "rewards/fixed_code_pass_all_test_reward/std": 0.14084143936634064, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.38959601549529604, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306640625, "kl": 0.029277416004333645, "learning_rate": 1.945927237752481e-05, "loss": 0.0012, "num_tokens": 17587582.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.3897804833056632, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.05177688947878778, "learning_rate": 1.9458227440837545e-05, "loss": 0.0021, "num_tokens": 17597092.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.38996495111603025, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.053191795479506254, "learning_rate": 1.9457181523584155e-05, "loss": 0.0021, "num_tokens": 17602000.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 351.0, "completions/mean_terminated_length": 351.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.3901494189263973, "frac_reward_zero_std": 1.0, "grad_norm": 0.035888671875, "kl": 0.02781262795906514, "learning_rate": 1.9456134625873076e-05, "loss": 0.0011, "num_tokens": 17609856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.39033388673676445, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07971406308934093, "learning_rate": 1.9455086747812847e-05, "loss": 0.0032, "num_tokens": 17614141.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3905183545471315, "frac_reward_zero_std": 1.0, "grad_norm": 0.051025390625, "kl": 0.04130468424409628, "learning_rate": 1.94540378895121e-05, "loss": 0.0017, "num_tokens": 17622263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3907028223574986, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04059723252430558, "learning_rate": 1.9452988051079572e-05, "loss": 0.0016, "num_tokens": 17627761.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3908872901678657, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.05294724949635565, "learning_rate": 1.945193723262411e-05, "loss": 0.0021, "num_tokens": 17632040.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3910717579782328, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.08972419775091112, "learning_rate": 1.945088543425465e-05, "loss": 0.0036, "num_tokens": 17640754.0, "reward": 1.8863636255264282, "reward_std": 0.32141217589378357, "rewards/fixed_code_pass_all_test_reward/mean": 0.8863636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.32141217589378357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.39125622578859987, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04183103935793042, "learning_rate": 1.9449832656080237e-05, "loss": 0.0017, "num_tokens": 17649471.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.391440693598967, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.05508595984429121, "learning_rate": 1.944877889821002e-05, "loss": 0.0022, "num_tokens": 17655890.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 233.875, "completions/mean_terminated_length": 233.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.39162516140933407, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.04842102574184537, "learning_rate": 1.9447724160753242e-05, "loss": 0.0019, "num_tokens": 17660665.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.39180962921970114, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.042912264936603606, "learning_rate": 1.9446668443819247e-05, "loss": 0.0017, "num_tokens": 17669615.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.39199409703006827, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.05266399076208472, "learning_rate": 1.944561174751749e-05, "loss": 0.0021, "num_tokens": 17680661.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 401.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.39217856484043534, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.028338501462712884, "learning_rate": 1.9444554071957523e-05, "loss": 0.0011, "num_tokens": 17690715.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.3923630326508024, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0769399469718337, "learning_rate": 1.9443495417249e-05, "loss": 0.0031, "num_tokens": 17694534.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 485.0, "completions/mean_terminated_length": 261.71429443359375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.39254750046116954, "frac_reward_zero_std": 0.0, "grad_norm": 0.61328125, "kl": 0.04944518709089607, "learning_rate": 1.944243578350167e-05, "loss": 0.002, "num_tokens": 17703638.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3927319682715366, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06131342728622258, "learning_rate": 1.944137517082539e-05, "loss": 0.0025, "num_tokens": 17711087.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3929164360819037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04012003121897578, "learning_rate": 1.9440313579330122e-05, "loss": 0.0016, "num_tokens": 17716545.0, "reward": 1.8249999284744263, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.8250000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 436.875, "completions/mean_terminated_length": 436.875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3931009038922708, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0478950230171904, "learning_rate": 1.9439251009125917e-05, "loss": 0.0019, "num_tokens": 17727456.0, "reward": 1.9943182468414307, "reward_std": 0.0035068909637629986, "rewards/fixed_code_pass_all_test_reward/mean": 0.9943181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0035068909637629986, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.3932853717026379, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.05803209752775729, "learning_rate": 1.9438187460322943e-05, "loss": 0.0023, "num_tokens": 17736824.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.39346983951300496, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.023261202848516405, "learning_rate": 1.9437122933031454e-05, "loss": 0.0009, "num_tokens": 17744668.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3936543073233721, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.02751844155136496, "learning_rate": 1.943605742736182e-05, "loss": 0.0011, "num_tokens": 17749021.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.39383877513373916, "frac_reward_zero_std": 0.0, "grad_norm": 4.46875, "kl": 0.286012418102473, "learning_rate": 1.9434990943424503e-05, "loss": 0.0114, "num_tokens": 17752765.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.39402324294410623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361328125, "kl": 0.023990232264623046, "learning_rate": 1.9433923481330067e-05, "loss": 0.001, "num_tokens": 17759362.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.39420771075447336, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.052420936757698655, "learning_rate": 1.943285504118918e-05, "loss": 0.0021, "num_tokens": 17768801.0, "reward": 1.6671512126922607, "reward_std": 0.41614899039268494, "rewards/fixed_code_pass_all_test_reward/mean": 0.6671512126922607, "rewards/fixed_code_pass_all_test_reward/std": 0.4161490201950073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.39439217856484043, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04765080125071108, "learning_rate": 1.9431785623112616e-05, "loss": 0.0019, "num_tokens": 17775321.0, "reward": 1.884615421295166, "reward_std": 0.21365076303482056, "rewards/fixed_code_pass_all_test_reward/mean": 0.884615421295166, "rewards/fixed_code_pass_all_test_reward/std": 0.21365077793598175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 470.0, "completions/mean_terminated_length": 470.0, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.3945766463752075, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.03481989342253655, "learning_rate": 1.943071522721124e-05, "loss": 0.0014, "num_tokens": 17790289.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.39476111418557464, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.06890724692493677, "learning_rate": 1.9429643853596025e-05, "loss": 0.0028, "num_tokens": 17799171.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 451.625, "completions/mean_terminated_length": 451.625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.3949455819959417, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.046528360806405544, "learning_rate": 1.9428571502378043e-05, "loss": 0.0019, "num_tokens": 17808088.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3951300498063088, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05304823839105666, "learning_rate": 1.9427498173668467e-05, "loss": 0.0021, "num_tokens": 17812357.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 447.625, "completions/mean_terminated_length": 447.625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.3953145176166759, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.044579875422641635, "learning_rate": 1.9426423867578576e-05, "loss": 0.0018, "num_tokens": 17822018.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 733.125, "completions/mean_terminated_length": 733.125, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.395498985427043, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.02649968327023089, "learning_rate": 1.9425348584219745e-05, "loss": 0.0011, "num_tokens": 17835291.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.39568345323741005, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05940220458433032, "learning_rate": 1.9424272323703453e-05, "loss": 0.0024, "num_tokens": 17844879.0, "reward": 1.615384578704834, "reward_std": 0.4111711084842682, "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, "rewards/fixed_code_pass_all_test_reward/std": 0.41117116808891296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3958679210477772, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.060845807660371065, "learning_rate": 1.942319508614128e-05, "loss": 0.0024, "num_tokens": 17854407.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.39605238885814426, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.05791920842602849, "learning_rate": 1.94221168716449e-05, "loss": 0.0023, "num_tokens": 17864494.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 103.375, "completions/mean_terminated_length": 103.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.39623685666851133, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.05569723201915622, "learning_rate": 1.9421037680326106e-05, "loss": 0.0022, "num_tokens": 17868057.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.39642132447887846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.03214497282169759, "learning_rate": 1.9419957512296775e-05, "loss": 0.0013, "num_tokens": 17874348.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.39660579228924553, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.049673862056806684, "learning_rate": 1.941887636766889e-05, "loss": 0.002, "num_tokens": 17883403.0, "reward": 1.6331522464752197, "reward_std": 0.30377885699272156, "rewards/fixed_code_pass_all_test_reward/mean": 0.633152186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.30377885699272156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.3967902600996126, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.08691768813878298, "learning_rate": 1.9417794246554546e-05, "loss": 0.0035, "num_tokens": 17892903.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 449.125, "completions/mean_terminated_length": 449.125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.39697472790997973, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.04274927731603384, "learning_rate": 1.941671114906592e-05, "loss": 0.0017, "num_tokens": 17901632.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 631.25, "completions/mean_terminated_length": 631.25, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.3971591957203468, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.05477748205885291, "learning_rate": 1.9415627075315305e-05, "loss": 0.0022, "num_tokens": 17916698.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3973436635307139, "frac_reward_zero_std": 0.0, "grad_norm": 4.21875, "kl": 0.3450149307027459, "learning_rate": 1.9414542025415088e-05, "loss": 0.0138, "num_tokens": 17920289.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 463.375, "completions/mean_terminated_length": 463.375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.397528131341081, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05411734222434461, "learning_rate": 1.9413455999477763e-05, "loss": 0.0022, "num_tokens": 17933716.0, "reward": 1.59375, "reward_std": 0.1735912710428238, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3977125991514481, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.039956387830898166, "learning_rate": 1.941236899761592e-05, "loss": 0.0016, "num_tokens": 17943888.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.39789706696181515, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05863950354978442, "learning_rate": 1.9411281019942255e-05, "loss": 0.0023, "num_tokens": 17949328.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3980815347721823, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.06123341480270028, "learning_rate": 1.941019206656956e-05, "loss": 0.0024, "num_tokens": 17953586.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.39826600258254935, "frac_reward_zero_std": 1.0, "grad_norm": 0.1982421875, "kl": 0.037422185007017106, "learning_rate": 1.9409102137610727e-05, "loss": 0.0015, "num_tokens": 17958351.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.3984504703929164, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.07626939087640494, "learning_rate": 1.9408011233178756e-05, "loss": 0.003, "num_tokens": 17969920.0, "reward": 1.2633495330810547, "reward_std": 0.0034325113520026207, "rewards/fixed_code_pass_all_test_reward/mean": 0.2633495330810547, "rewards/fixed_code_pass_all_test_reward/std": 0.0034325553569942713, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.39863493820328355, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.11290344595909119, "learning_rate": 1.9406919353386747e-05, "loss": 0.0045, "num_tokens": 17976100.0, "reward": 1.783088207244873, "reward_std": 0.32786908745765686, "rewards/fixed_code_pass_all_test_reward/mean": 0.783088207244873, "rewards/fixed_code_pass_all_test_reward/std": 0.32786908745765686, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.3988194060136506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.03554335323860869, "learning_rate": 1.94058264983479e-05, "loss": 0.0014, "num_tokens": 17983079.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 458.375, "completions/mean_terminated_length": 458.375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.3990038738240177, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.059270055731758475, "learning_rate": 1.9404732668175505e-05, "loss": 0.0024, "num_tokens": 17997618.0, "reward": 1.4038461446762085, "reward_std": 0.4307495951652527, "rewards/fixed_code_pass_all_test_reward/mean": 0.4038461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.4307496249675751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3991883416343848, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.042807078221812844, "learning_rate": 1.940363786298297e-05, "loss": 0.0017, "num_tokens": 18002897.0, "reward": 1.375, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3993728094447519, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.03869446483440697, "learning_rate": 1.94025420828838e-05, "loss": 0.0015, "num_tokens": 18007535.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.39955727725511897, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.07656186539679766, "learning_rate": 1.940144532799159e-05, "loss": 0.0031, "num_tokens": 18011567.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3997417450654861, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.05544002237729728, "learning_rate": 1.9400347598420056e-05, "loss": 0.0022, "num_tokens": 18019529.0, "reward": 1.9047619104385376, "reward_std": 0.2693740129470825, "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.2693740129470825, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.39992621287585317, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.05693213013000786, "learning_rate": 1.9399248894282993e-05, "loss": 0.0023, "num_tokens": 18023780.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.40011068068622024, "frac_reward_zero_std": 1.0, "grad_norm": 0.271484375, "kl": 0.10353199951350689, "learning_rate": 1.9398149215694308e-05, "loss": 0.0041, "num_tokens": 18027636.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.40029514849658737, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.06699408125132322, "learning_rate": 1.9397048562768015e-05, "loss": 0.0027, "num_tokens": 18036222.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.40047961630695444, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.03513389895670116, "learning_rate": 1.939594693561822e-05, "loss": 0.0014, "num_tokens": 18046370.0, "reward": 1.399999976158142, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4006640841173215, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.09294602856971323, "learning_rate": 1.9394844334359124e-05, "loss": 0.0037, "num_tokens": 18054430.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.40084855192768865, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.06675314856693149, "learning_rate": 1.939374075910505e-05, "loss": 0.0027, "num_tokens": 18062030.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4010330197380557, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.09125767834484577, "learning_rate": 1.93926362099704e-05, "loss": 0.0037, "num_tokens": 18071312.0, "reward": 1.191176414489746, "reward_std": 0.4950321614742279, "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, "rewards/fixed_code_pass_all_test_reward/std": 0.1723969727754593, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.4012174875484228, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03939844435080886, "learning_rate": 1.9391530687069692e-05, "loss": 0.0016, "num_tokens": 18079908.0, "reward": 1.899999976158142, "reward_std": 0.10690455138683319, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.40140195535878986, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.057252559112384915, "learning_rate": 1.9390424190517536e-05, "loss": 0.0023, "num_tokens": 18090547.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.401586423169157, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.031221885699778795, "learning_rate": 1.938931672042865e-05, "loss": 0.0012, "num_tokens": 18094194.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.40177089097952406, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0883671403862536, "learning_rate": 1.9388208276917842e-05, "loss": 0.0035, "num_tokens": 18103119.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.40195535878989114, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.06271557370200753, "learning_rate": 1.9387098860100037e-05, "loss": 0.0025, "num_tokens": 18112074.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.40213982660025827, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.05180928623303771, "learning_rate": 1.9385988470090242e-05, "loss": 0.0021, "num_tokens": 18121036.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1511.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 742.0, "completions/mean_terminated_length": 742.0, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.40232429441062534, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.0375310453819111, "learning_rate": 1.9384877107003587e-05, "loss": 0.0015, "num_tokens": 18140380.0, "reward": 1.3125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4025087622209924, "frac_reward_zero_std": 1.0, "grad_norm": 0.37890625, "kl": 0.15197537187486887, "learning_rate": 1.938376477095528e-05, "loss": 0.0061, "num_tokens": 18144886.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 448.0, "completions/mean_terminated_length": 448.0, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.40269323003135954, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.049643858103081584, "learning_rate": 1.9382651462060643e-05, "loss": 0.002, "num_tokens": 18156390.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4028776978417266, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.10876232432201505, "learning_rate": 1.9381537180435103e-05, "loss": 0.0044, "num_tokens": 18164603.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4030621656520937, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.04911126522347331, "learning_rate": 1.9380421926194172e-05, "loss": 0.002, "num_tokens": 18173818.0, "reward": 1.5401785373687744, "reward_std": 0.6934295296669006, "rewards/fixed_code_pass_all_test_reward/mean": 0.6651785969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.40717756748199463, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4032466334624608, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.036480508744716644, "learning_rate": 1.9379305699453478e-05, "loss": 0.0015, "num_tokens": 18180454.0, "reward": 1.951923131942749, "reward_std": 0.13598209619522095, "rewards/fixed_code_pass_all_test_reward/mean": 0.9519230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.13598206639289856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.4034311012728279, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.04836196266114712, "learning_rate": 1.9378188500328746e-05, "loss": 0.0019, "num_tokens": 18188608.0, "reward": 1.8888888359069824, "reward_std": 0.11878276616334915, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.11878276616334915, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.40361556908319496, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.061212467262521386, "learning_rate": 1.937707032893579e-05, "loss": 0.0024, "num_tokens": 18197615.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4038000368935621, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.048279145965352654, "learning_rate": 1.9375951185390543e-05, "loss": 0.0019, "num_tokens": 18204115.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.40398450470392916, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.07347853295505047, "learning_rate": 1.937483106980903e-05, "loss": 0.0029, "num_tokens": 18211536.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.40416897251429623, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05301134707406163, "learning_rate": 1.9373709982307377e-05, "loss": 0.0021, "num_tokens": 18222008.0, "reward": 1.9229323863983154, "reward_std": 0.14270161092281342, "rewards/fixed_code_pass_all_test_reward/mean": 0.9229323267936707, "rewards/fixed_code_pass_all_test_reward/std": 0.14270161092281342, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.40435344032466336, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.02576239500194788, "learning_rate": 1.9372587923001807e-05, "loss": 0.001, "num_tokens": 18228930.0, "reward": 1.3214285373687744, "reward_std": 0.4644818902015686, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.03818017616868019, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.40453790813503043, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.06120583042502403, "learning_rate": 1.937146489200865e-05, "loss": 0.0024, "num_tokens": 18238143.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4047223759453975, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05549574992619455, "learning_rate": 1.9370340889444333e-05, "loss": 0.0022, "num_tokens": 18248472.0, "reward": 1.7828947305679321, "reward_std": 0.40810689330101013, "rewards/fixed_code_pass_all_test_reward/mean": 0.7828947305679321, "rewards/fixed_code_pass_all_test_reward/std": 0.4081069231033325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.40490684375576463, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.06633026199415326, "learning_rate": 1.9369215915425388e-05, "loss": 0.0027, "num_tokens": 18252825.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.4050913115661317, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.06826285785064101, "learning_rate": 1.9368089970068442e-05, "loss": 0.0027, "num_tokens": 18261436.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4052757793764988, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.04750300641171634, "learning_rate": 1.9366963053490227e-05, "loss": 0.0019, "num_tokens": 18265878.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4054602471868659, "frac_reward_zero_std": 1.0, "grad_norm": 0.04345703125, "kl": 0.028336454997770488, "learning_rate": 1.9365835165807576e-05, "loss": 0.0011, "num_tokens": 18272636.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.405644714997233, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.0360101864207536, "learning_rate": 1.9364706307137417e-05, "loss": 0.0014, "num_tokens": 18276735.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.40582918280760005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.04845994687639177, "learning_rate": 1.9363576477596786e-05, "loss": 0.0019, "num_tokens": 18286206.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4060136506179672, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.06846655998378992, "learning_rate": 1.936244567730281e-05, "loss": 0.0027, "num_tokens": 18295550.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.40619811842833425, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.047910166904330254, "learning_rate": 1.9361313906372734e-05, "loss": 0.0019, "num_tokens": 18305828.0, "reward": 1.9577702283859253, "reward_std": 0.08025240153074265, "rewards/fixed_code_pass_all_test_reward/mean": 0.9577702879905701, "rewards/fixed_code_pass_all_test_reward/std": 0.08025235682725906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4063825862387013, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.05310535826720297, "learning_rate": 1.936018116492388e-05, "loss": 0.0021, "num_tokens": 18317386.0, "reward": 1.478124976158142, "reward_std": 0.25405198335647583, "rewards/fixed_code_pass_all_test_reward/mean": 0.4781250059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.25405198335647583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.40656705404906845, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.06808144506067038, "learning_rate": 1.9359047453073696e-05, "loss": 0.0027, "num_tokens": 18329022.0, "reward": 1.2989130020141602, "reward_std": 0.4338110387325287, "rewards/fixed_code_pass_all_test_reward/mean": 0.29891303181648254, "rewards/fixed_code_pass_all_test_reward/std": 0.4338110685348511, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 356.0, "completions/mean_terminated_length": 356.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4067515218594355, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0661726831458509, "learning_rate": 1.9357912770939708e-05, "loss": 0.0026, "num_tokens": 18338350.0, "reward": 1.2314815521240234, "reward_std": 0.5052148103713989, "rewards/fixed_code_pass_all_test_reward/mean": 0.35648149251937866, "rewards/fixed_code_pass_all_test_reward/std": 0.16849380731582642, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4069359896698026, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.05881792004220188, "learning_rate": 1.9356777118639552e-05, "loss": 0.0024, "num_tokens": 18350054.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.40712045748016973, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.03625512705184519, "learning_rate": 1.9355640496290967e-05, "loss": 0.0015, "num_tokens": 18359567.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.4073049252905368, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.058654078748077154, "learning_rate": 1.9354502904011794e-05, "loss": 0.0023, "num_tokens": 18371315.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.4074893931009039, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.02990041926386766, "learning_rate": 1.9353364341919972e-05, "loss": 0.0012, "num_tokens": 18384418.0, "reward": 1.9752066135406494, "reward_std": 0.057935524731874466, "rewards/fixed_code_pass_all_test_reward/mean": 0.9752066135406494, "rewards/fixed_code_pass_all_test_reward/std": 0.05793552100658417, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.407673860911271, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.0631833306979388, "learning_rate": 1.9352224810133532e-05, "loss": 0.0025, "num_tokens": 18394069.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4078583287216381, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.06377551006153226, "learning_rate": 1.935108430877062e-05, "loss": 0.0026, "num_tokens": 18398457.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.40804279653200515, "frac_reward_zero_std": 1.0, "grad_norm": 0.337890625, "kl": 0.07804432092234492, "learning_rate": 1.934994283794947e-05, "loss": 0.0031, "num_tokens": 18407391.0, "reward": 1.8947367668151855, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8947368264198303, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 448.75, "completions/mean_terminated_length": 448.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.4082272643423723, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.041587830521166325, "learning_rate": 1.9348800397788424e-05, "loss": 0.0017, "num_tokens": 18416853.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.40841173215273935, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.03245314140804112, "learning_rate": 1.9347656988405925e-05, "loss": 0.0013, "num_tokens": 18426539.0, "reward": 1.9888060092926025, "reward_std": 0.020727353170514107, "rewards/fixed_code_pass_all_test_reward/mean": 0.9888059496879578, "rewards/fixed_code_pass_all_test_reward/std": 0.020727327093482018, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4085961999631064, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.06698237918317318, "learning_rate": 1.9346512609920515e-05, "loss": 0.0027, "num_tokens": 18434127.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.40878066777347355, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.02205577283166349, "learning_rate": 1.9345367262450827e-05, "loss": 0.0009, "num_tokens": 18447488.0, "reward": 1.5340908765792847, "reward_std": 0.44915637373924255, "rewards/fixed_code_pass_all_test_reward/mean": 0.6590908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.21041364967823029, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 754.375, "completions/mean_terminated_length": 323.16668701171875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4089651355838406, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06076545245014131, "learning_rate": 1.934422094611561e-05, "loss": 0.0024, "num_tokens": 18457299.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4091496033942077, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.06589684984646738, "learning_rate": 1.9343073661033708e-05, "loss": 0.0026, "num_tokens": 18466626.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 364.75, "completions/mean_terminated_length": 364.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4093340712045748, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.05481755780056119, "learning_rate": 1.9341925407324064e-05, "loss": 0.0022, "num_tokens": 18473720.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.4095185390149419, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.09364976780489087, "learning_rate": 1.9340776185105712e-05, "loss": 0.0037, "num_tokens": 18484360.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 433.25, "completions/mean_terminated_length": 433.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.40970300682530897, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06644648895598948, "learning_rate": 1.9339625994497808e-05, "loss": 0.0027, "num_tokens": 18495594.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4098874746356761, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.04905774537473917, "learning_rate": 1.933847483561959e-05, "loss": 0.002, "num_tokens": 18499634.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.41007194244604317, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.04332609195262194, "learning_rate": 1.9337322708590398e-05, "loss": 0.0017, "num_tokens": 18509373.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.41025641025641024, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03939898288808763, "learning_rate": 1.9336169613529686e-05, "loss": 0.0016, "num_tokens": 18513780.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 360.125, "completions/mean_terminated_length": 360.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.41044087806677737, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.02126741223037243, "learning_rate": 1.9335015550556994e-05, "loss": 0.0009, "num_tokens": 18520101.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.41062534587714444, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.02886657346971333, "learning_rate": 1.9333860519791968e-05, "loss": 0.0012, "num_tokens": 18525281.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 570.125, "completions/mean_terminated_length": 570.125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.4108098136875115, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.053466693265363574, "learning_rate": 1.9332704521354356e-05, "loss": 0.0021, "num_tokens": 18539554.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 491.375, "completions/mean_terminated_length": 491.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.41099428149787864, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.062037328723818064, "learning_rate": 1.9331547555364e-05, "loss": 0.0025, "num_tokens": 18550101.0, "reward": 1.65625, "reward_std": 0.48065245151519775, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4111787493082457, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.07843072223477066, "learning_rate": 1.9330389621940854e-05, "loss": 0.0031, "num_tokens": 18557197.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 995.75, "completions/mean_terminated_length": 364.3999938964844, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4113632171186128, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.04001449604402296, "learning_rate": 1.9329230721204956e-05, "loss": 0.0016, "num_tokens": 18569371.0, "reward": 0.9204545617103577, "reward_std": 0.8520237803459167, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.45259320735931396, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4115476849289799, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.04664748511277139, "learning_rate": 1.9328070853276458e-05, "loss": 0.0019, "num_tokens": 18574225.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.411732152739347, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.039697568863630295, "learning_rate": 1.932691001827561e-05, "loss": 0.0016, "num_tokens": 18579094.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 554.125, "completions/mean_terminated_length": 554.125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.41191662054971406, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.042685813969001174, "learning_rate": 1.932574821632275e-05, "loss": 0.0017, "num_tokens": 18594199.0, "reward": 1.875, "reward_std": 0.3162689208984375, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3162688910961151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4121010883600812, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.03532982035540044, "learning_rate": 1.932458544753834e-05, "loss": 0.0014, "num_tokens": 18599080.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 384.875, "completions/mean_terminated_length": 384.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.41228555617044826, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.1344591446686536, "learning_rate": 1.9323421712042915e-05, "loss": 0.0054, "num_tokens": 18608015.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.41247002398081534, "frac_reward_zero_std": 1.0, "grad_norm": 0.322265625, "kl": 0.06811526301316917, "learning_rate": 1.9322257009957132e-05, "loss": 0.0027, "num_tokens": 18613011.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 786.625, "completions/mean_terminated_length": 606.4285888671875, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.41265449179118247, "frac_reward_zero_std": 0.0, "grad_norm": 0.369140625, "kl": 0.046631124801933765, "learning_rate": 1.932109134140173e-05, "loss": 0.0019, "num_tokens": 18625080.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.41283895960154954, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.07852643867954612, "learning_rate": 1.9319924706497567e-05, "loss": 0.0031, "num_tokens": 18636387.0, "reward": 1.357954502105713, "reward_std": 0.14463543891906738, "rewards/fixed_code_pass_all_test_reward/mean": 0.35795456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.14463548362255096, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4130234274119166, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.04115585068939254, "learning_rate": 1.9318757105365594e-05, "loss": 0.0016, "num_tokens": 18646649.0, "reward": 1.324013113975525, "reward_std": 0.009789749048650265, "rewards/fixed_code_pass_all_test_reward/mean": 0.3240131735801697, "rewards/fixed_code_pass_all_test_reward/std": 0.009789785370230675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 625.25, "completions/mean_terminated_length": 422.0000305175781, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.41320789522228374, "frac_reward_zero_std": 0.0, "grad_norm": 0.62890625, "kl": 0.0555822413880378, "learning_rate": 1.9317588538126852e-05, "loss": 0.0022, "num_tokens": 18657763.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4133923630326508, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.026169362594373524, "learning_rate": 1.931641900490249e-05, "loss": 0.001, "num_tokens": 18662189.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4135768308430179, "frac_reward_zero_std": 1.0, "grad_norm": 0.259765625, "kl": 0.053370186826214194, "learning_rate": 1.9315248505813763e-05, "loss": 0.0021, "num_tokens": 18671747.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.41376129865338496, "frac_reward_zero_std": 1.0, "grad_norm": 0.048828125, "kl": 0.03429671248886734, "learning_rate": 1.931407704098202e-05, "loss": 0.0014, "num_tokens": 18678518.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 374.125, "completions/mean_terminated_length": 374.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4139457664637521, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046875, "kl": 0.10347723960876465, "learning_rate": 1.9312904610528708e-05, "loss": 0.0041, "num_tokens": 18687447.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 351.0, "completions/mean_terminated_length": 351.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.41413023427411916, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02256230456987396, "learning_rate": 1.931173121457538e-05, "loss": 0.0009, "num_tokens": 18694855.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 710.75, "completions/mean_terminated_length": 519.7142944335938, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.41431470208448623, "frac_reward_zero_std": 0.0, "grad_norm": 0.37109375, "kl": 0.052067259210161865, "learning_rate": 1.931055685324368e-05, "loss": 0.0021, "num_tokens": 18709589.0, "reward": 1.0967742204666138, "reward_std": 0.4617042541503906, "rewards/fixed_code_pass_all_test_reward/mean": 0.22177419066429138, "rewards/fixed_code_pass_all_test_reward/std": 0.15750157833099365, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 554.875, "completions/mean_terminated_length": 554.875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.41449916989485336, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.042478944407776, "learning_rate": 1.9309381526655362e-05, "loss": 0.0017, "num_tokens": 18724412.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.41468363770522043, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.032415792578831315, "learning_rate": 1.930820523493228e-05, "loss": 0.0013, "num_tokens": 18734497.0, "reward": 1.5, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 541.125, "completions/mean_terminated_length": 541.125, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.4148681055155875, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.051177745684981346, "learning_rate": 1.9307027978196376e-05, "loss": 0.002, "num_tokens": 18743546.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.41505257332595463, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03792436153162271, "learning_rate": 1.9305849756569705e-05, "loss": 0.0015, "num_tokens": 18750290.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 516.375, "completions/mean_terminated_length": 516.375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.4152370411363217, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.06106442678719759, "learning_rate": 1.9304670570174414e-05, "loss": 0.0024, "num_tokens": 18763005.0, "reward": 1.8333332538604736, "reward_std": 0.25814807415008545, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.25814807415008545, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4154215089466888, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.06257821898907423, "learning_rate": 1.9303490419132758e-05, "loss": 0.0025, "num_tokens": 18770564.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4156059767570559, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.060645506251603365, "learning_rate": 1.9302309303567085e-05, "loss": 0.0024, "num_tokens": 18776712.0, "reward": 1.1145832538604736, "reward_std": 0.062001921236515045, "rewards/fixed_code_pass_all_test_reward/mean": 0.1145833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.06200198456645012, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 581.0, "completions/mean_terminated_length": 581.0, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.415790444567423, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.04547513881698251, "learning_rate": 1.9301127223599843e-05, "loss": 0.0018, "num_tokens": 18787792.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.41597491237779005, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.06013173097744584, "learning_rate": 1.9299944179353587e-05, "loss": 0.0024, "num_tokens": 18797772.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4161593801881572, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.027040946180932224, "learning_rate": 1.9298760170950964e-05, "loss": 0.0011, "num_tokens": 18803700.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.41634384799852425, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.04632193874567747, "learning_rate": 1.929757519851472e-05, "loss": 0.0019, "num_tokens": 18811937.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4165283158088913, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.05699801770970225, "learning_rate": 1.9296389262167712e-05, "loss": 0.0023, "num_tokens": 18820512.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 375.75, "completions/mean_terminated_length": 375.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.41671278361925845, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.051666841842234135, "learning_rate": 1.9295202362032886e-05, "loss": 0.0021, "num_tokens": 18827862.0, "reward": 1.5, "reward_std": 0.3149183392524719, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3149183392524719, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4168972514296255, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.051860887790098786, "learning_rate": 1.9294014498233294e-05, "loss": 0.0021, "num_tokens": 18832271.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4170817192399926, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.10116012068465352, "learning_rate": 1.9292825670892088e-05, "loss": 0.004, "num_tokens": 18836211.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4172661870503597, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.05369422060903162, "learning_rate": 1.929163588013251e-05, "loss": 0.0021, "num_tokens": 18844851.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 425.0, "completions/mean_terminated_length": 425.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4174506548607268, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.037908138474449515, "learning_rate": 1.9290445126077917e-05, "loss": 0.0015, "num_tokens": 18869059.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 446.375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.4176351226710939, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.041092017432674766, "learning_rate": 1.9289253408851758e-05, "loss": 0.0016, "num_tokens": 18877262.0, "reward": 1.6578947305679321, "reward_std": 0.10895779728889465, "rewards/fixed_code_pass_all_test_reward/mean": 0.6578947305679321, "rewards/fixed_code_pass_all_test_reward/std": 0.10895773023366928, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.417819590481461, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.07798714423552155, "learning_rate": 1.9288060728577575e-05, "loss": 0.0031, "num_tokens": 18885402.0, "reward": 1.7321429252624512, "reward_std": 0.3128393590450287, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3128393292427063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 341.625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4180040582918281, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03691658528987318, "learning_rate": 1.9286867085379027e-05, "loss": 0.0015, "num_tokens": 18895287.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 436.0, "completions/mean_terminated_length": 205.71429443359375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.41818852610219515, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.016175875207409263, "learning_rate": 1.9285672479379856e-05, "loss": 0.0006, "num_tokens": 18902263.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4183729939125623, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.05496821540873498, "learning_rate": 1.928447691070391e-05, "loss": 0.0022, "num_tokens": 18910514.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 392.0, "completions/mean_terminated_length": 392.0, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.41855746172292935, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.02638878033030778, "learning_rate": 1.9283280379475145e-05, "loss": 0.0011, "num_tokens": 18918698.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4187419295332964, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.029873751453123987, "learning_rate": 1.9282082885817607e-05, "loss": 0.0012, "num_tokens": 18922646.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.41892639734366355, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.057515296153724194, "learning_rate": 1.9280884429855438e-05, "loss": 0.0023, "num_tokens": 18931840.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4191108651540306, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.15789741277694702, "learning_rate": 1.9279685011712894e-05, "loss": 0.0063, "num_tokens": 18936404.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.4192953329643977, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.058952752500772476, "learning_rate": 1.9278484631514316e-05, "loss": 0.0024, "num_tokens": 18945955.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4194798007747648, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03770574275404215, "learning_rate": 1.9277283289384154e-05, "loss": 0.0015, "num_tokens": 18953802.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4196642685851319, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.08242155937477946, "learning_rate": 1.927608098544696e-05, "loss": 0.0033, "num_tokens": 18961775.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.41984873639549897, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.034104855614714324, "learning_rate": 1.9274877719827373e-05, "loss": 0.0014, "num_tokens": 18971525.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 438.5, "completions/mean_terminated_length": 438.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.4200332042058661, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.04342841892503202, "learning_rate": 1.927367349265014e-05, "loss": 0.0017, "num_tokens": 18979841.0, "reward": 1.6136362552642822, "reward_std": 0.0420827642083168, "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 452.625, "completions/mean_terminated_length": 452.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.42021767201623317, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.05553004937246442, "learning_rate": 1.9272468304040116e-05, "loss": 0.0022, "num_tokens": 18991678.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.42040213982660024, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.042978229466825724, "learning_rate": 1.9271262154122238e-05, "loss": 0.0017, "num_tokens": 19001415.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 690.75, "completions/mean_terminated_length": 690.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.42058660763696737, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.03954421426169574, "learning_rate": 1.9270055043021556e-05, "loss": 0.0016, "num_tokens": 19015509.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 378.25, "completions/mean_terminated_length": 378.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.42077107544733444, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.061089978320524096, "learning_rate": 1.926884697086321e-05, "loss": 0.0024, "num_tokens": 19024119.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4209555432577015, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.03823255992028862, "learning_rate": 1.9267637937772456e-05, "loss": 0.0015, "num_tokens": 19029932.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.42114001106806864, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.05146806943230331, "learning_rate": 1.9266427943874627e-05, "loss": 0.0021, "num_tokens": 19039257.0, "reward": 1.58695650100708, "reward_std": 0.2902688980102539, "rewards/fixed_code_pass_all_test_reward/mean": 0.5869565010070801, "rewards/fixed_code_pass_all_test_reward/std": 0.2902688682079315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4213244788784357, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.055736158741638064, "learning_rate": 1.9265216989295174e-05, "loss": 0.0022, "num_tokens": 19044959.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.4215089466888028, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.05023083230480552, "learning_rate": 1.9264005074159633e-05, "loss": 0.002, "num_tokens": 19053023.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4216934144991699, "frac_reward_zero_std": 1.0, "grad_norm": 0.28515625, "kl": 0.05534060252830386, "learning_rate": 1.9262792198593657e-05, "loss": 0.0022, "num_tokens": 19057032.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.421877882309537, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.04976737545803189, "learning_rate": 1.9261578362722986e-05, "loss": 0.002, "num_tokens": 19065185.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 304.75, "completions/mean_terminated_length": 304.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.42206235011990406, "frac_reward_zero_std": 0.0, "grad_norm": 0.68359375, "kl": 0.02806751464959234, "learning_rate": 1.926036356667346e-05, "loss": 0.0011, "num_tokens": 19073151.0, "reward": 1.9358108043670654, "reward_std": 0.18155446648597717, "rewards/fixed_code_pass_all_test_reward/mean": 0.9358108043670654, "rewards/fixed_code_pass_all_test_reward/std": 0.18155445158481598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 479.75, "completions/mean_terminated_length": 479.75, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.4222468179302712, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03395665902644396, "learning_rate": 1.925914781057102e-05, "loss": 0.0014, "num_tokens": 19082581.0, "reward": 1.7083333730697632, "reward_std": 0.14337210357189178, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1433720886707306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.42243128574063826, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.07398072304204106, "learning_rate": 1.925793109454171e-05, "loss": 0.003, "num_tokens": 19086358.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.42261575355100534, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.05742534273304045, "learning_rate": 1.925671341871167e-05, "loss": 0.0023, "num_tokens": 19092772.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.42280022136137246, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.044009767938405275, "learning_rate": 1.925549478320714e-05, "loss": 0.0018, "num_tokens": 19099301.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.42298468917173954, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.03462363255675882, "learning_rate": 1.9254275188154462e-05, "loss": 0.0014, "num_tokens": 19107627.0, "reward": 1.7237085103988647, "reward_std": 0.28233999013900757, "rewards/fixed_code_pass_all_test_reward/mean": 0.7237085103988647, "rewards/fixed_code_pass_all_test_reward/std": 0.2823399305343628, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4231691569821066, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.06847275653854012, "learning_rate": 1.9253054633680074e-05, "loss": 0.0027, "num_tokens": 19111860.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 441.0, "completions/mean_terminated_length": 441.0, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.42335362479247374, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.019391046196687967, "learning_rate": 1.925183311991052e-05, "loss": 0.0008, "num_tokens": 19120388.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4235380926028408, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.01970634318422526, "learning_rate": 1.9250610646972428e-05, "loss": 0.0008, "num_tokens": 19126856.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 95.5, "completions/mean_terminated_length": 95.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.4237225604132079, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.1823858292773366, "learning_rate": 1.9249387214992544e-05, "loss": 0.0073, "num_tokens": 19130308.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.423907028223575, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.05038139899261296, "learning_rate": 1.9248162824097703e-05, "loss": 0.002, "num_tokens": 19140788.0, "reward": 1.0714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4240914960339421, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.0333156727720052, "learning_rate": 1.924693747441484e-05, "loss": 0.0013, "num_tokens": 19146172.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 254.0, "completions/mean_terminated_length": 254.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.42427596384430916, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.08257703110575676, "learning_rate": 1.9245711166070995e-05, "loss": 0.0033, "num_tokens": 19152068.0, "reward": 1.8636363744735718, "reward_std": 0.2524963617324829, "rewards/fixed_code_pass_all_test_reward/mean": 0.8636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.2524963915348053, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 868.75, "completions/mean_terminated_length": 475.66668701171875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.4244604316546763, "frac_reward_zero_std": 0.0, "grad_norm": 0.41796875, "kl": 0.01572364807361737, "learning_rate": 1.92444838991933e-05, "loss": 0.0006, "num_tokens": 19164858.0, "reward": 1.2604167461395264, "reward_std": 0.784428060054779, "rewards/fixed_code_pass_all_test_reward/mean": 0.5104166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3307189345359802, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.42464489946504336, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.07347368029877543, "learning_rate": 1.9243255673908994e-05, "loss": 0.0029, "num_tokens": 19169812.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 465.75, "completions/mean_terminated_length": 465.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.42482936727541043, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.03824932366842404, "learning_rate": 1.9242026490345406e-05, "loss": 0.0015, "num_tokens": 19180546.0, "reward": 1.1666665077209473, "reward_std": 0.03636966645717621, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.036369647830724716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.42501383508577756, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07914357958361506, "learning_rate": 1.9240796348629972e-05, "loss": 0.0032, "num_tokens": 19186810.0, "reward": 1.7999999523162842, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.42519830289614463, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.03608485125005245, "learning_rate": 1.9239565248890225e-05, "loss": 0.0014, "num_tokens": 19195534.0, "reward": 1.524999976158142, "reward_std": 0.22730305790901184, "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.22730302810668945, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 111.75, "completions/mean_terminated_length": 111.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4253827707065117, "frac_reward_zero_std": 1.0, "grad_norm": 0.2333984375, "kl": 0.08649184228852391, "learning_rate": 1.9238333191253797e-05, "loss": 0.0035, "num_tokens": 19199244.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 580.375, "completions/mean_terminated_length": 580.375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.42556723851687883, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.035986074479296803, "learning_rate": 1.923710017584842e-05, "loss": 0.0014, "num_tokens": 19214975.0, "reward": 1.0299999713897705, "reward_std": 0.08485280722379684, "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4257517063272459, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.05489029781892896, "learning_rate": 1.9235866202801924e-05, "loss": 0.0022, "num_tokens": 19223380.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.425936174137613, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.08160588308237493, "learning_rate": 1.9234631272242243e-05, "loss": 0.0033, "num_tokens": 19234676.0, "reward": 1.8858695030212402, "reward_std": 0.3228096067905426, "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.42612064194798005, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04796403902582824, "learning_rate": 1.92333953842974e-05, "loss": 0.0019, "num_tokens": 19241908.0, "reward": 1.4464285373687744, "reward_std": 0.3926251530647278, "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.39262518286705017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 454.875, "completions/mean_terminated_length": 454.875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.4263051097583472, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.014643264235928655, "learning_rate": 1.9232158539095526e-05, "loss": 0.0006, "num_tokens": 19253283.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.42648957756871425, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.07480897568166256, "learning_rate": 1.923092073676485e-05, "loss": 0.003, "num_tokens": 19260239.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4266740453790813, "frac_reward_zero_std": 0.0, "grad_norm": 5.1875, "kl": 0.06601953622885048, "learning_rate": 1.92296819774337e-05, "loss": 0.0026, "num_tokens": 19264959.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.42685851318944845, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.05821204977110028, "learning_rate": 1.9228442261230503e-05, "loss": 0.0023, "num_tokens": 19273735.0, "reward": 1.1966667175292969, "reward_std": 0.04320497065782547, "rewards/fixed_code_pass_all_test_reward/mean": 0.1966666579246521, "rewards/fixed_code_pass_all_test_reward/std": 0.04320494830608368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 460.625, "completions/mean_terminated_length": 460.625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.4270429809998155, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.07222754089161754, "learning_rate": 1.9227201588283776e-05, "loss": 0.0029, "num_tokens": 19282044.0, "reward": 1.9302325248718262, "reward_std": 0.17884741723537445, "rewards/fixed_code_pass_all_test_reward/mean": 0.9302325248718262, "rewards/fixed_code_pass_all_test_reward/std": 0.17884741723537445, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 336.75, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4272274488101826, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.04622276150621474, "learning_rate": 1.9225959958722156e-05, "loss": 0.0018, "num_tokens": 19291714.0, "reward": 1.8026316165924072, "reward_std": 0.2893027663230896, "rewards/fixed_code_pass_all_test_reward/mean": 0.8026316165924072, "rewards/fixed_code_pass_all_test_reward/std": 0.2893027663230896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4274119166205497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.04531836276873946, "learning_rate": 1.922471737267436e-05, "loss": 0.0018, "num_tokens": 19298159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 449.375, "completions/mean_terminated_length": 449.375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.4275963844309168, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.035482348408550024, "learning_rate": 1.922347383026921e-05, "loss": 0.0014, "num_tokens": 19307298.0, "reward": 1.2395832538604736, "reward_std": 0.029462814331054688, "rewards/fixed_code_pass_all_test_reward/mean": 0.2395833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.029462780803442, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 412.125, "completions/mean_terminated_length": 412.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.42778085224128387, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07914287154562771, "learning_rate": 1.922222933163563e-05, "loss": 0.0032, "num_tokens": 19315259.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.427965320051651, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.060592830181121826, "learning_rate": 1.9220983876902647e-05, "loss": 0.0024, "num_tokens": 19319463.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 468.125, "completions/mean_terminated_length": 468.125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.42814978786201807, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.03913731896318495, "learning_rate": 1.921973746619937e-05, "loss": 0.0016, "num_tokens": 19327928.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.42833425567238514, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.09962846525013447, "learning_rate": 1.921849009965503e-05, "loss": 0.004, "num_tokens": 19334262.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 594.5, "completions/mean_terminated_length": 594.5, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.4285187234827523, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.036094429437071085, "learning_rate": 1.921724177739894e-05, "loss": 0.0014, "num_tokens": 19344754.0, "reward": 1.8060344457626343, "reward_std": 0.25402316451072693, "rewards/fixed_code_pass_all_test_reward/mean": 0.8060344457626343, "rewards/fixed_code_pass_all_test_reward/std": 0.25402316451072693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 451.875, "completions/mean_terminated_length": 451.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.42870319129311935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.0469690237659961, "learning_rate": 1.9215992499560515e-05, "loss": 0.0019, "num_tokens": 19354537.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 413.125, "completions/mean_terminated_length": 413.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4288876591034864, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.043040668591856956, "learning_rate": 1.9214742266269275e-05, "loss": 0.0017, "num_tokens": 19364002.0, "reward": 1.359375, "reward_std": 0.4470679461956024, "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, "rewards/fixed_code_pass_all_test_reward/std": 0.4470680058002472, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 346.75, "completions/mean_terminated_length": 346.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.42907212691385355, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.05405280482955277, "learning_rate": 1.921349107765484e-05, "loss": 0.0022, "num_tokens": 19371440.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4292565947242206, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.053432710003107786, "learning_rate": 1.9212238933846915e-05, "loss": 0.0021, "num_tokens": 19382979.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4294410625345877, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.051974230678752065, "learning_rate": 1.9210985834975323e-05, "loss": 0.0021, "num_tokens": 19390820.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.4296255303449548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361328125, "kl": 0.022210419527255, "learning_rate": 1.9209731781169974e-05, "loss": 0.0009, "num_tokens": 19400085.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4298099981553219, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.06940677133388817, "learning_rate": 1.920847677256088e-05, "loss": 0.0028, "num_tokens": 19408927.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 768.875, "completions/mean_terminated_length": 586.1428833007812, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.42999446596568897, "frac_reward_zero_std": 0.0, "grad_norm": 0.50390625, "kl": 0.02712810243247077, "learning_rate": 1.9207220809278154e-05, "loss": 0.0011, "num_tokens": 19423158.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4301789337760561, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.0771363670937717, "learning_rate": 1.9205963891452e-05, "loss": 0.0031, "num_tokens": 19429026.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 645.5, "completions/mean_terminated_length": 645.5, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.43036340158642317, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.045490658143535256, "learning_rate": 1.920470601921273e-05, "loss": 0.0018, "num_tokens": 19439534.0, "reward": 1.3833333253860474, "reward_std": 0.24364951252937317, "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, "rewards/fixed_code_pass_all_test_reward/std": 0.24364949762821198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 385.375, "completions/mean_terminated_length": 385.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.43054786939679024, "frac_reward_zero_std": 1.0, "grad_norm": 0.2236328125, "kl": 0.06689278548583388, "learning_rate": 1.9203447192690754e-05, "loss": 0.0027, "num_tokens": 19447081.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 444.0, "completions/mean_terminated_length": 444.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.43073233720715737, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.0388527640607208, "learning_rate": 1.9202187412016577e-05, "loss": 0.0016, "num_tokens": 19455641.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.43091680501752444, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.048122330103069544, "learning_rate": 1.9200926677320805e-05, "loss": 0.0019, "num_tokens": 19461361.0, "reward": 1.9464285373687744, "reward_std": 0.1062890887260437, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.10628911107778549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 519.125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.4311012728278915, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.04164264351129532, "learning_rate": 1.919966498873414e-05, "loss": 0.0017, "num_tokens": 19470378.0, "reward": 1.3068181276321411, "reward_std": 0.19998523592948914, "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.19998525083065033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.43128574063825864, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.07707827165722847, "learning_rate": 1.919840234638739e-05, "loss": 0.0031, "num_tokens": 19474862.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.4314702084486257, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04515744931995869, "learning_rate": 1.9197138750411452e-05, "loss": 0.0018, "num_tokens": 19479594.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 716.125, "completions/mean_terminated_length": 525.857177734375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.4316546762589928, "frac_reward_zero_std": 0.0, "grad_norm": 0.7890625, "kl": 0.057672600261867046, "learning_rate": 1.9195874200937336e-05, "loss": 0.0023, "num_tokens": 19491403.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.4318391440693599, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04179658833891153, "learning_rate": 1.919460869809613e-05, "loss": 0.0017, "num_tokens": 19500070.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 472.75, "completions/mean_terminated_length": 472.75, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.432023611879727, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.07956786081194878, "learning_rate": 1.9193342242019044e-05, "loss": 0.0032, "num_tokens": 19508636.0, "reward": 1.5416666269302368, "reward_std": 0.22072911262512207, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.22072911262512207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.43220807969009406, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.06560546555556357, "learning_rate": 1.9192074832837367e-05, "loss": 0.0026, "num_tokens": 19518873.0, "reward": 1.7791666984558105, "reward_std": 0.05892553552985191, "rewards/fixed_code_pass_all_test_reward/mean": 0.7791666984558105, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.4323925475004612, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.047562207211740315, "learning_rate": 1.9190806470682503e-05, "loss": 0.0019, "num_tokens": 19526997.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 409.875, "completions/mean_terminated_length": 409.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.43257701531082826, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.03533302713185549, "learning_rate": 1.9189537155685944e-05, "loss": 0.0014, "num_tokens": 19534244.0, "reward": 1.5, "reward_std": 0.7292091846466064, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.417855441570282, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.43276148312119533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.0498668325599283, "learning_rate": 1.9188266887979287e-05, "loss": 0.002, "num_tokens": 19540757.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 894.875, "completions/mean_terminated_length": 510.5, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.43294595093156246, "frac_reward_zero_std": 0.0, "grad_norm": 0.6484375, "kl": 0.0201860044762725, "learning_rate": 1.9186995667694216e-05, "loss": 0.0008, "num_tokens": 19554044.0, "reward": 1.25, "reward_std": 0.8864052295684814, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 799.75, "completions/mean_terminated_length": 799.75, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.43313041874192953, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.030497669242322445, "learning_rate": 1.918572349496253e-05, "loss": 0.0012, "num_tokens": 19571522.0, "reward": 1.2916667461395264, "reward_std": 0.3063361644744873, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.3063361942768097, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 478.875, "completions/mean_terminated_length": 478.875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4333148865522966, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.061279186280444264, "learning_rate": 1.9184450369916123e-05, "loss": 0.0025, "num_tokens": 19580401.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.43349935436266374, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.03851740341633558, "learning_rate": 1.9183176292686974e-05, "loss": 0.0015, "num_tokens": 19589751.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.4336838221730308, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.06285226996988058, "learning_rate": 1.9181901263407178e-05, "loss": 0.0025, "num_tokens": 19600252.0, "reward": 1.6428570747375488, "reward_std": 0.3972390294075012, "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3972390294075012, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 404.75, "completions/mean_terminated_length": 404.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4338682899833979, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.054634854197502136, "learning_rate": 1.918062528220892e-05, "loss": 0.0022, "num_tokens": 19624338.0, "reward": 1.7981927394866943, "reward_std": 0.37864968180656433, "rewards/fixed_code_pass_all_test_reward/mean": 0.7981927394866943, "rewards/fixed_code_pass_all_test_reward/std": 0.37864968180656433, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.434052757793765, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.04426986863836646, "learning_rate": 1.9179348349224483e-05, "loss": 0.0018, "num_tokens": 19629152.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4342372256041321, "frac_reward_zero_std": 0.0, "grad_norm": 0.578125, "kl": 0.042994947521947324, "learning_rate": 1.9178070464586255e-05, "loss": 0.0017, "num_tokens": 19639416.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.43442169341449915, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0721307871863246, "learning_rate": 1.9176791628426718e-05, "loss": 0.0029, "num_tokens": 19648121.0, "reward": 0.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 651.875, "completions/mean_terminated_length": 452.4285888671875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.4346061612248663, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06704360741423443, "learning_rate": 1.9175511840878446e-05, "loss": 0.0027, "num_tokens": 19660784.0, "reward": 0.9375, "reward_std": 0.3788071870803833, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 321.25, "completions/mean_terminated_length": 321.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.43479062903523336, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.06823410396464169, "learning_rate": 1.917423110207413e-05, "loss": 0.0027, "num_tokens": 19669138.0, "reward": 1.0840909481048584, "reward_std": 0.01928478479385376, "rewards/fixed_code_pass_all_test_reward/mean": 0.08409091085195541, "rewards/fixed_code_pass_all_test_reward/std": 0.01928473263978958, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.43497509684560043, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.07440023776143789, "learning_rate": 1.9172949412146542e-05, "loss": 0.003, "num_tokens": 19675701.0, "reward": 1.59375, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.43515956465596756, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04279282200150192, "learning_rate": 1.917166677122856e-05, "loss": 0.0017, "num_tokens": 19682511.0, "reward": 1.625, "reward_std": 0.2781743109226227, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.43534403246633463, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.08303224807605147, "learning_rate": 1.9170383179453158e-05, "loss": 0.0033, "num_tokens": 19692043.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4355285002767017, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.04342034482397139, "learning_rate": 1.916909863695341e-05, "loss": 0.0017, "num_tokens": 19698117.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 408.5, "completions/mean_terminated_length": 408.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.43571296808706883, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.03608640073798597, "learning_rate": 1.9167813143862497e-05, "loss": 0.0014, "num_tokens": 19706425.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 219.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4358974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.07215271471068263, "learning_rate": 1.9166526700313683e-05, "loss": 0.0029, "num_tokens": 19714470.0, "reward": 1.0535714626312256, "reward_std": 0.0739356130361557, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.10744597436860204, "learning_rate": 1.9165239306440336e-05, "loss": 0.0043, "num_tokens": 19718286.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 373.875, "completions/mean_terminated_length": 373.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4362663715181701, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.03611526032909751, "learning_rate": 1.916395096237593e-05, "loss": 0.0014, "num_tokens": 19728413.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 397.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.4364508393285372, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.045328846434131265, "learning_rate": 1.916266166825403e-05, "loss": 0.0018, "num_tokens": 19737520.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 426.625, "completions/mean_terminated_length": 426.625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.43663530713890425, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.057888827519491315, "learning_rate": 1.9161371424208296e-05, "loss": 0.0023, "num_tokens": 19748069.0, "reward": 1.6938775777816772, "reward_std": 0.6113696098327637, "rewards/fixed_code_pass_all_test_reward/mean": 0.8188775777816772, "rewards/fixed_code_pass_all_test_reward/std": 0.3375398814678192, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4368197749492714, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.08168609626591206, "learning_rate": 1.9160080230372502e-05, "loss": 0.0033, "num_tokens": 19755942.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.43700424275963845, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.07749372208490968, "learning_rate": 1.9158788086880502e-05, "loss": 0.0031, "num_tokens": 19761113.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4371887105700055, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.038680328289046884, "learning_rate": 1.9157494993866262e-05, "loss": 0.0015, "num_tokens": 19768133.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 425.625, "completions/mean_terminated_length": 425.625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.43737317838037265, "frac_reward_zero_std": 1.0, "grad_norm": 0.053955078125, "kl": 0.027796739479526877, "learning_rate": 1.915620095146384e-05, "loss": 0.0011, "num_tokens": 19776906.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 381.5, "completions/mean_terminated_length": 381.5, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.4375576461907397, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.020519911544397473, "learning_rate": 1.9154905959807394e-05, "loss": 0.0008, "num_tokens": 19784958.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4377421140011068, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.09658517176285386, "learning_rate": 1.9153610019031177e-05, "loss": 0.0039, "num_tokens": 19788862.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4379265818114739, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05363936512731016, "learning_rate": 1.9152313129269545e-05, "loss": 0.0021, "num_tokens": 19798524.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.438111049621841, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.1740614203736186, "learning_rate": 1.9151015290656955e-05, "loss": 0.007, "num_tokens": 19804424.0, "reward": 1.8857526779174805, "reward_std": 0.3231402039527893, "rewards/fixed_code_pass_all_test_reward/mean": 0.8857526779174805, "rewards/fixed_code_pass_all_test_reward/std": 0.3231402337551117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.43829551743220807, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.041146688628941774, "learning_rate": 1.914971650332795e-05, "loss": 0.0016, "num_tokens": 19809361.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.43847998524257514, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.04686918528750539, "learning_rate": 1.9148416767417188e-05, "loss": 0.0019, "num_tokens": 19814150.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.43866445305294227, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05641306610777974, "learning_rate": 1.9147116083059413e-05, "loss": 0.0023, "num_tokens": 19820443.0, "reward": 1.8016917705535889, "reward_std": 0.29313498735427856, "rewards/fixed_code_pass_all_test_reward/mean": 0.8016917705535889, "rewards/fixed_code_pass_all_test_reward/std": 0.2931349575519562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.43884892086330934, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.03580252768006176, "learning_rate": 1.9145814450389472e-05, "loss": 0.0014, "num_tokens": 19829930.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.4390333886736764, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.055946134962141514, "learning_rate": 1.9144511869542312e-05, "loss": 0.0022, "num_tokens": 19834183.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 470.25, "completions/mean_terminated_length": 470.25, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.43921785648404355, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03587186196818948, "learning_rate": 1.914320834065297e-05, "loss": 0.0014, "num_tokens": 19843249.0, "reward": 1.6120129823684692, "reward_std": 0.07649830728769302, "rewards/fixed_code_pass_all_test_reward/mean": 0.6120129823684692, "rewards/fixed_code_pass_all_test_reward/std": 0.07649827003479004, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.4394023242944106, "frac_reward_zero_std": 0.0, "grad_norm": 34.75, "kl": 0.2773497684393078, "learning_rate": 1.914190386385659e-05, "loss": 0.0111, "num_tokens": 19849790.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4395867921047777, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.032679900992661715, "learning_rate": 1.9140598439288412e-05, "loss": 0.0013, "num_tokens": 19855544.0, "reward": 1.440000057220459, "reward_std": 0.3038797080516815, "rewards/fixed_code_pass_all_test_reward/mean": 0.4399999976158142, "rewards/fixed_code_pass_all_test_reward/std": 0.30387967824935913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 453.375, "completions/mean_terminated_length": 453.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.4397712599151448, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.05411852872930467, "learning_rate": 1.9139292067083776e-05, "loss": 0.0022, "num_tokens": 19867635.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4399557277255119, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.05432027648203075, "learning_rate": 1.9137984747378117e-05, "loss": 0.0022, "num_tokens": 19872507.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.44014019553587896, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.07272415515035391, "learning_rate": 1.9136676480306967e-05, "loss": 0.0029, "num_tokens": 19877521.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4403246633462461, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03546681790612638, "learning_rate": 1.9135367266005957e-05, "loss": 0.0014, "num_tokens": 19887230.0, "reward": 1.8181818723678589, "reward_std": 0.13571275770664215, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.13571275770664215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.44050913115661317, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.08220332162454724, "learning_rate": 1.913405710461082e-05, "loss": 0.0033, "num_tokens": 19897658.0, "reward": 1.0138888359069824, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.44069359896698024, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.06648998102173209, "learning_rate": 1.9132745996257388e-05, "loss": 0.0027, "num_tokens": 19908681.0, "reward": 1.3914473056793213, "reward_std": 0.15816861391067505, "rewards/fixed_code_pass_all_test_reward/mean": 0.39144736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.15816862881183624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.44087806677734737, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.050412053940817714, "learning_rate": 1.9131433941081585e-05, "loss": 0.002, "num_tokens": 19918490.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 702.875, "completions/mean_terminated_length": 702.875, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.44106253458771444, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.030703907483257353, "learning_rate": 1.9130120939219436e-05, "loss": 0.0012, "num_tokens": 19935185.0, "reward": 1.5, "reward_std": 0.4655483365058899, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4655483663082123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.4412470023980815, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.03788535506464541, "learning_rate": 1.912880699080706e-05, "loss": 0.0015, "num_tokens": 19939370.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.44143147020844864, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.06428035930730402, "learning_rate": 1.9127492095980685e-05, "loss": 0.0026, "num_tokens": 19946760.0, "reward": 1.9406249523162842, "reward_std": 0.16793784499168396, "rewards/fixed_code_pass_all_test_reward/mean": 0.940625011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.16793787479400635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4416159380188157, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.04636247269809246, "learning_rate": 1.9126176254876633e-05, "loss": 0.0019, "num_tokens": 19953196.0, "reward": 1.0192307233810425, "reward_std": 0.4121977686882019, "rewards/fixed_code_pass_all_test_reward/mean": 0.14423076808452606, "rewards/fixed_code_pass_all_test_reward/std": 0.06081303581595421, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.4418004058291828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.029619957669638097, "learning_rate": 1.912485946763131e-05, "loss": 0.0012, "num_tokens": 19957936.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 403.75, "completions/mean_terminated_length": 403.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.4419848736395499, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.07484762789681554, "learning_rate": 1.9123541734381244e-05, "loss": 0.003, "num_tokens": 19968470.0, "reward": 1.3333333730697632, "reward_std": 0.3563483655452728, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 493.0, "completions/mean_terminated_length": 493.0, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.442169341449917, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.045036432798951864, "learning_rate": 1.9122223055263043e-05, "loss": 0.0018, "num_tokens": 19978118.0, "reward": 1.774999976158142, "reward_std": 0.37701839208602905, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.37701839208602905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.44235380926028406, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05718409572727978, "learning_rate": 1.912090343041342e-05, "loss": 0.0023, "num_tokens": 19982497.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 336.75, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4425382770706512, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.08714699000120163, "learning_rate": 1.911958285996918e-05, "loss": 0.0035, "num_tokens": 19995127.0, "reward": 1.899999976158142, "reward_std": 0.21380899846553802, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.21380899846553802, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.44272274488101826, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.053044983418658376, "learning_rate": 1.9118261344067236e-05, "loss": 0.0021, "num_tokens": 20000874.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.44290721269138533, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04983460553921759, "learning_rate": 1.9116938882844596e-05, "loss": 0.002, "num_tokens": 20014149.0, "reward": 1.170258641242981, "reward_std": 0.33526611328125, "rewards/fixed_code_pass_all_test_reward/mean": 0.17025862634181976, "rewards/fixed_code_pass_all_test_reward/std": 0.3352661430835724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.44309168050175246, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.07310786168090999, "learning_rate": 1.9115615476438362e-05, "loss": 0.0029, "num_tokens": 20024089.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.44327614831211953, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.040407335152849555, "learning_rate": 1.9114291124985732e-05, "loss": 0.0016, "num_tokens": 20027804.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4434606161224866, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.04502365714870393, "learning_rate": 1.911296582862401e-05, "loss": 0.0018, "num_tokens": 20035195.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.44364508393285373, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.041253834031522274, "learning_rate": 1.9111639587490595e-05, "loss": 0.0017, "num_tokens": 20046387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4438295517432208, "frac_reward_zero_std": 1.0, "grad_norm": 0.6171875, "kl": 0.10233459621667862, "learning_rate": 1.911031240172298e-05, "loss": 0.0041, "num_tokens": 20055140.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.4440140195535879, "frac_reward_zero_std": 1.0, "grad_norm": 0.244140625, "kl": 0.07702045887708664, "learning_rate": 1.9108984271458758e-05, "loss": 0.0031, "num_tokens": 20065054.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.444198487363955, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.03391783335246146, "learning_rate": 1.9107655196835627e-05, "loss": 0.0014, "num_tokens": 20071474.0, "reward": 1.84375, "reward_std": 0.2893187701702118, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4443829551743221, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.08611002354882658, "learning_rate": 1.910632517799137e-05, "loss": 0.0034, "num_tokens": 20082280.0, "reward": 1.7300000190734863, "reward_std": 0.3726353943347931, "rewards/fixed_code_pass_all_test_reward/mean": 0.7300000190734863, "rewards/fixed_code_pass_all_test_reward/std": 0.3726354241371155, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.44456742298468915, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.06404017843306065, "learning_rate": 1.9104994215063876e-05, "loss": 0.0026, "num_tokens": 20088191.0, "reward": 1.6273585557937622, "reward_std": 0.22579647600650787, "rewards/fixed_code_pass_all_test_reward/mean": 0.6273584961891174, "rewards/fixed_code_pass_all_test_reward/std": 0.22579655051231384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4447518907950563, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.05312766553834081, "learning_rate": 1.910366230819113e-05, "loss": 0.0021, "num_tokens": 20095316.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.44493635860542335, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.028320763492956758, "learning_rate": 1.9102329457511217e-05, "loss": 0.0011, "num_tokens": 20102418.0, "reward": 1.9090909957885742, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4451208264157904, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.05380662437528372, "learning_rate": 1.9100995663162317e-05, "loss": 0.0022, "num_tokens": 20110430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.44530529422615756, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.04163537628483027, "learning_rate": 1.909966092528271e-05, "loss": 0.0017, "num_tokens": 20120828.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.44548976203652463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.04899488063529134, "learning_rate": 1.9098325244010773e-05, "loss": 0.002, "num_tokens": 20125148.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 490.875, "completions/mean_terminated_length": 490.875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.4456742298468917, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.05311124725267291, "learning_rate": 1.9096988619484977e-05, "loss": 0.0021, "num_tokens": 20134259.0, "reward": 1.3214285373687744, "reward_std": 0.6296836137771606, "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.3794080317020416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 528.875, "completions/mean_terminated_length": 528.875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.44585869765725883, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.023211789783090353, "learning_rate": 1.90956510518439e-05, "loss": 0.0009, "num_tokens": 20142970.0, "reward": 1.375, "reward_std": 0.40089187026023865, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.40089187026023865, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.4460431654676259, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.06979157985188067, "learning_rate": 1.9094312541226207e-05, "loss": 0.0028, "num_tokens": 20149770.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.446227633277993, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.06892778677865863, "learning_rate": 1.909297308777067e-05, "loss": 0.0028, "num_tokens": 20153699.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.4464121010883601, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.03642709576524794, "learning_rate": 1.909163269161615e-05, "loss": 0.0015, "num_tokens": 20158269.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.4465965688987272, "frac_reward_zero_std": 1.0, "grad_norm": 0.48828125, "kl": 0.10374970780685544, "learning_rate": 1.9090291352901615e-05, "loss": 0.0041, "num_tokens": 20162591.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.44678103670909425, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.06303893262520432, "learning_rate": 1.9088949071766124e-05, "loss": 0.0025, "num_tokens": 20172038.0, "reward": 1.8430233001708984, "reward_std": 0.21664845943450928, "rewards/fixed_code_pass_all_test_reward/mean": 0.8430233001708984, "rewards/fixed_code_pass_all_test_reward/std": 0.21664850413799286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4469655045194614, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.04099803953431547, "learning_rate": 1.9087605848348834e-05, "loss": 0.0016, "num_tokens": 20177364.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 431.125, "completions/mean_terminated_length": 431.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.44714997232982845, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.045270358212292194, "learning_rate": 1.9086261682789004e-05, "loss": 0.0018, "num_tokens": 20187005.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.4473344401401955, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.04848519433289766, "learning_rate": 1.9084916575225988e-05, "loss": 0.0019, "num_tokens": 20195752.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.44751890795056265, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.06755637563765049, "learning_rate": 1.908357052579924e-05, "loss": 0.0027, "num_tokens": 20205452.0, "reward": 1.9196429252624512, "reward_std": 0.2272842973470688, "rewards/fixed_code_pass_all_test_reward/mean": 0.9196428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.22728432714939117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4477033757609297, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04701856151223183, "learning_rate": 1.90822235346483e-05, "loss": 0.0019, "num_tokens": 20214624.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 241.875, "completions/mean_terminated_length": 241.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.4478878435712968, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04325226484797895, "learning_rate": 1.9080875601912824e-05, "loss": 0.0017, "num_tokens": 20220095.0, "reward": 1.859375, "reward_std": 0.26252126693725586, "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, "rewards/fixed_code_pass_all_test_reward/std": 0.26252126693725586, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4480723113816639, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.02845128159970045, "learning_rate": 1.9079526727732558e-05, "loss": 0.0011, "num_tokens": 20229342.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.448256779192031, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.042596359038725495, "learning_rate": 1.9078176912247336e-05, "loss": 0.0017, "num_tokens": 20239964.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.44844124700239807, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.049838698003441095, "learning_rate": 1.9076826155597108e-05, "loss": 0.002, "num_tokens": 20243925.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 425.5, "completions/mean_terminated_length": 425.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4486257148127652, "frac_reward_zero_std": 1.0, "grad_norm": 0.037109375, "kl": 0.015027020941488445, "learning_rate": 1.90754744579219e-05, "loss": 0.0006, "num_tokens": 20252825.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.44881018262313227, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.06528020044788718, "learning_rate": 1.9074121819361856e-05, "loss": 0.0026, "num_tokens": 20257092.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.44899465043349934, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0668763059657067, "learning_rate": 1.907276824005721e-05, "loss": 0.0027, "num_tokens": 20263441.0, "reward": 1.1141304969787598, "reward_std": 0.3228096067905426, "rewards/fixed_code_pass_all_test_reward/mean": 0.11413043737411499, "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.44917911824386647, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.05478463624604046, "learning_rate": 1.9071413720148283e-05, "loss": 0.0022, "num_tokens": 20273541.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 444.0, "completions/mean_terminated_length": 444.0, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.44936358605423354, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.03800552221946418, "learning_rate": 1.907005825977551e-05, "loss": 0.0015, "num_tokens": 20282245.0, "reward": 1.8928570747375488, "reward_std": 0.30304577946662903, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 398.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.4495480538646006, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.028570421738550067, "learning_rate": 1.9068701859079408e-05, "loss": 0.0011, "num_tokens": 20290149.0, "reward": 1.9854650497436523, "reward_std": 0.012036032974720001, "rewards/fixed_code_pass_all_test_reward/mean": 0.9854651093482971, "rewards/fixed_code_pass_all_test_reward/std": 0.012036033906042576, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 397.0, "completions/mean_terminated_length": 397.0, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.44973252167496774, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.041548533365130424, "learning_rate": 1.906734451820061e-05, "loss": 0.0017, "num_tokens": 20297957.0, "reward": 1.1029412746429443, "reward_std": 0.0415944829583168, "rewards/fixed_code_pass_all_test_reward/mean": 0.10294117778539658, "rewards/fixed_code_pass_all_test_reward/std": 0.04159452021121979, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4499169894853348, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.0731375110335648, "learning_rate": 1.906598623727983e-05, "loss": 0.0029, "num_tokens": 20309536.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 703.0, "completions/mean_terminated_length": 510.857177734375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.4501014572957019, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.03679737285710871, "learning_rate": 1.9064627016457885e-05, "loss": 0.0015, "num_tokens": 20323072.0, "reward": 1.6628788709640503, "reward_std": 0.6728243231773376, "rewards/fixed_code_pass_all_test_reward/mean": 0.7878787517547607, "rewards/fixed_code_pass_all_test_reward/std": 0.320287823677063, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.450285925106069, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.058982398360967636, "learning_rate": 1.9063266855875695e-05, "loss": 0.0024, "num_tokens": 20327993.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4504703929164361, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.06672487175092101, "learning_rate": 1.906190575567427e-05, "loss": 0.0027, "num_tokens": 20332233.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.45065486072680316, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0702868674416095, "learning_rate": 1.9060543715994713e-05, "loss": 0.0028, "num_tokens": 20341659.0, "reward": 1.8990384340286255, "reward_std": 0.28556233644485474, "rewards/fixed_code_pass_all_test_reward/mean": 0.8990384340286255, "rewards/fixed_code_pass_all_test_reward/std": 0.2855623662471771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.45083932853717024, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.03574440581724048, "learning_rate": 1.9059180736978242e-05, "loss": 0.0014, "num_tokens": 20346766.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.45102379634753736, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.027888314565643668, "learning_rate": 1.9057816818766156e-05, "loss": 0.0011, "num_tokens": 20355869.0, "reward": 1.875, "reward_std": 0.1649916023015976, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1649915874004364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.45120826415790444, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.07331628212705255, "learning_rate": 1.905645196149986e-05, "loss": 0.0029, "num_tokens": 20359642.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4513927319682715, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.08381405915133655, "learning_rate": 1.905508616532085e-05, "loss": 0.0034, "num_tokens": 20369992.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 96.875, "completions/mean_terminated_length": 96.875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.45157719977863864, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.05653827963396907, "learning_rate": 1.9053719430370717e-05, "loss": 0.0023, "num_tokens": 20373463.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4517616675890057, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.0568434433080256, "learning_rate": 1.905235175679117e-05, "loss": 0.0023, "num_tokens": 20380992.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 352.625, "completions/mean_terminated_length": 352.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.4519461353993728, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.03464179555885494, "learning_rate": 1.905098314472399e-05, "loss": 0.0014, "num_tokens": 20389621.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4521306032097399, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.06937766401097178, "learning_rate": 1.9049613594311066e-05, "loss": 0.0028, "num_tokens": 20394641.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.452315071020107, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02606621861923486, "learning_rate": 1.9048243105694383e-05, "loss": 0.001, "num_tokens": 20401158.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 600.75, "completions/mean_terminated_length": 600.75, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.45249953883047406, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.030177247244864702, "learning_rate": 1.9046871679016033e-05, "loss": 0.0012, "num_tokens": 20414612.0, "reward": 1.0208332538604736, "reward_std": 0.012858637608587742, "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, "rewards/fixed_code_pass_all_test_reward/std": 0.012858612462878227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 469.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.4526840066408412, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04229787411168218, "learning_rate": 1.9045499314418186e-05, "loss": 0.0017, "num_tokens": 20423582.0, "reward": 1.567307710647583, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.45286847445120826, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.05416658194735646, "learning_rate": 1.9044126012043125e-05, "loss": 0.0022, "num_tokens": 20432210.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.45305294226157533, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.056523178005591035, "learning_rate": 1.904275177203322e-05, "loss": 0.0023, "num_tokens": 20442121.0, "reward": 1.476190447807312, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4761904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 428.875, "completions/mean_terminated_length": 428.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.45323741007194246, "frac_reward_zero_std": 1.0, "grad_norm": 0.189453125, "kl": 0.0615214582066983, "learning_rate": 1.9041376594530953e-05, "loss": 0.0025, "num_tokens": 20454408.0, "reward": 1.2142857313156128, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2142857164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 443.625, "completions/mean_terminated_length": 443.625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.45342187788230953, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04897480271756649, "learning_rate": 1.9040000479678885e-05, "loss": 0.002, "num_tokens": 20465821.0, "reward": 1.8977272510528564, "reward_std": 0.11331124603748322, "rewards/fixed_code_pass_all_test_reward/mean": 0.8977273106575012, "rewards/fixed_code_pass_all_test_reward/std": 0.11331123113632202, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4536063456926766, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.042797015281394124, "learning_rate": 1.903862342761968e-05, "loss": 0.0017, "num_tokens": 20470677.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 125.125, "completions/mean_terminated_length": 125.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.45379081350304373, "frac_reward_zero_std": 0.0, "grad_norm": 2.703125, "kl": 0.07229993329383433, "learning_rate": 1.9037245438496107e-05, "loss": 0.0029, "num_tokens": 20474542.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.4539752813134108, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.06182421091943979, "learning_rate": 1.9035866512451032e-05, "loss": 0.0025, "num_tokens": 20483283.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 768.25, "completions/mean_terminated_length": 768.25, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.4541597491237779, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.02068014396354556, "learning_rate": 1.90344866496274e-05, "loss": 0.0008, "num_tokens": 20496013.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 566.0, "completions/mean_terminated_length": 354.2857360839844, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.454344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 0.4375, "kl": 0.055398496333509684, "learning_rate": 1.9033105850168273e-05, "loss": 0.0022, "num_tokens": 20507501.0, "reward": 1.6936274766921997, "reward_std": 0.7022891044616699, "rewards/fixed_code_pass_all_test_reward/mean": 0.8186274766921997, "rewards/fixed_code_pass_all_test_reward/std": 0.3664921224117279, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 482.875, "completions/mean_terminated_length": 482.875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.4545286847445121, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.027201927616260946, "learning_rate": 1.9031724114216804e-05, "loss": 0.0011, "num_tokens": 20515948.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.45471315255487915, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.05558465304784477, "learning_rate": 1.903034144191624e-05, "loss": 0.0022, "num_tokens": 20523169.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4548976203652463, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.09402292827144265, "learning_rate": 1.9028957833409933e-05, "loss": 0.0038, "num_tokens": 20532198.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 494.875, "completions/mean_terminated_length": 494.875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.45508208817561335, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.052129237446933985, "learning_rate": 1.9027573288841315e-05, "loss": 0.0021, "num_tokens": 20540973.0, "reward": 1.4826388359069824, "reward_std": 0.6010971665382385, "rewards/fixed_code_pass_all_test_reward/mean": 0.6076388359069824, "rewards/fixed_code_pass_all_test_reward/std": 0.25041303038597107, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 399.625, "completions/mean_terminated_length": 399.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4552665559859804, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.020885087666101754, "learning_rate": 1.9026187808353935e-05, "loss": 0.0008, "num_tokens": 20548170.0, "reward": 1.6607142686843872, "reward_std": 0.716069221496582, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 407.25, "completions/mean_terminated_length": 407.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.45545102379634755, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.06685170345008373, "learning_rate": 1.9024801392091427e-05, "loss": 0.0027, "num_tokens": 20557380.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 336.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4556354916067146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.05506485025398433, "learning_rate": 1.902341404019753e-05, "loss": 0.0022, "num_tokens": 20563405.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4558199594170817, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.07170854299329221, "learning_rate": 1.902202575281607e-05, "loss": 0.0029, "num_tokens": 20568390.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.4560044272274488, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.040886704344302416, "learning_rate": 1.9020636530090976e-05, "loss": 0.0016, "num_tokens": 20577903.0, "reward": 1.8305084705352783, "reward_std": 0.3495918810367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.8305084705352783, "rewards/fixed_code_pass_all_test_reward/std": 0.3495918810367584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4561888950378159, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.06330850347876549, "learning_rate": 1.901924637216628e-05, "loss": 0.0025, "num_tokens": 20583045.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.456373362848183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.019923260435461998, "learning_rate": 1.901785527918609e-05, "loss": 0.0008, "num_tokens": 20588459.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4565578306585501, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955078125, "kl": 0.046889389865100384, "learning_rate": 1.9016463251294644e-05, "loss": 0.0019, "num_tokens": 20595523.0, "reward": 1.8518519401550293, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8518518805503845, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 327.625, "completions/mean_terminated_length": 327.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4567422984689172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.031182186910882592, "learning_rate": 1.9015070288636243e-05, "loss": 0.0012, "num_tokens": 20601832.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.45692676627928425, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.060395329259335995, "learning_rate": 1.901367639135531e-05, "loss": 0.0024, "num_tokens": 20609243.0, "reward": 1.735795497894287, "reward_std": 0.10445894300937653, "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.10445895045995712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 409.75, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.4571112340896514, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.05128108547069132, "learning_rate": 1.9012281559596344e-05, "loss": 0.0021, "num_tokens": 20617425.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.45729570190001845, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.04223368666134775, "learning_rate": 1.9010885793503965e-05, "loss": 0.0017, "num_tokens": 20621714.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 419.125, "completions/mean_terminated_length": 419.125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.4574801697103855, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.03437046578619629, "learning_rate": 1.9009489093222865e-05, "loss": 0.0014, "num_tokens": 20630259.0, "reward": 1.567307710647583, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.45766463752075265, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.0721117933280766, "learning_rate": 1.9008091458897854e-05, "loss": 0.0029, "num_tokens": 20640407.0, "reward": 1.3199999332427979, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3199999928474426, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4578491053311197, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.05214321683160961, "learning_rate": 1.9006692890673823e-05, "loss": 0.0021, "num_tokens": 20648188.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.4580335731414868, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.03493988059926778, "learning_rate": 1.9005293388695772e-05, "loss": 0.0014, "num_tokens": 20657671.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.4582180409518539, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.044305355520918965, "learning_rate": 1.9003892953108788e-05, "loss": 0.0018, "num_tokens": 20662021.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.458402508762221, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.05712498165667057, "learning_rate": 1.9002491584058055e-05, "loss": 0.0023, "num_tokens": 20669832.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 415.625, "completions/mean_terminated_length": 415.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.45858697657258807, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.029951917356811464, "learning_rate": 1.9001089281688867e-05, "loss": 0.0012, "num_tokens": 20676653.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4587714443829552, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06121822237037122, "learning_rate": 1.8999686046146598e-05, "loss": 0.0024, "num_tokens": 20686389.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.45895591219332227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.04804081073962152, "learning_rate": 1.899828187757673e-05, "loss": 0.0019, "num_tokens": 20693246.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 643.75, "completions/mean_terminated_length": 643.75, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.45914038000368934, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "kl": 0.03889798605814576, "learning_rate": 1.8996876776124836e-05, "loss": 0.0016, "num_tokens": 20711900.0, "reward": 1.8020833730697632, "reward_std": 0.3240906596183777, "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.45932484781405647, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.09396759839728475, "learning_rate": 1.8995470741936588e-05, "loss": 0.0038, "num_tokens": 20719386.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.45950931562442354, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05584133695811033, "learning_rate": 1.8994063775157757e-05, "loss": 0.0022, "num_tokens": 20727564.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4596937834347906, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.03982403548434377, "learning_rate": 1.89926558759342e-05, "loss": 0.0016, "num_tokens": 20737028.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.45987825124515774, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.056834566057659686, "learning_rate": 1.8991247044411886e-05, "loss": 0.0023, "num_tokens": 20747052.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.4600627190555248, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.05896364198997617, "learning_rate": 1.898983728073687e-05, "loss": 0.0024, "num_tokens": 20753507.0, "reward": 1.671875, "reward_std": 0.3716367185115814, "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, "rewards/fixed_code_pass_all_test_reward/std": 0.3716367185115814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4602471868658919, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.07373520731925964, "learning_rate": 1.8988426585055313e-05, "loss": 0.0029, "num_tokens": 20763556.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.460431654676259, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.06801700964570045, "learning_rate": 1.8987014957513458e-05, "loss": 0.0027, "num_tokens": 20777987.0, "reward": 1.9302325248718262, "reward_std": 0.04972304776310921, "rewards/fixed_code_pass_all_test_reward/mean": 0.9302325248718262, "rewards/fixed_code_pass_all_test_reward/std": 0.04972302168607712, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4606161224866261, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.027140547521412373, "learning_rate": 1.898560239825766e-05, "loss": 0.0011, "num_tokens": 20784023.0, "reward": 1.7586207389831543, "reward_std": 0.09753197431564331, "rewards/fixed_code_pass_all_test_reward/mean": 0.7586206793785095, "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.46080059029699316, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.04661717941053212, "learning_rate": 1.8984188907434356e-05, "loss": 0.0019, "num_tokens": 20797905.0, "reward": 1.5348360538482666, "reward_std": 0.4236942529678345, "rewards/fixed_code_pass_all_test_reward/mean": 0.5348360538482666, "rewards/fixed_code_pass_all_test_reward/std": 0.42369428277015686, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.4609850581073603, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.10119939292781055, "learning_rate": 1.8982774485190094e-05, "loss": 0.004, "num_tokens": 20806525.0, "reward": 1.943750023841858, "reward_std": 0.1590990275144577, "rewards/fixed_code_pass_all_test_reward/mean": 0.9437500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1590990275144577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.46116952591772736, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.03617412189487368, "learning_rate": 1.8981359131671514e-05, "loss": 0.0014, "num_tokens": 20811478.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.46135399372809444, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.04275578772649169, "learning_rate": 1.897994284702534e-05, "loss": 0.0017, "num_tokens": 20821498.0, "reward": 1.3829786777496338, "reward_std": 0.8596827983856201, "rewards/fixed_code_pass_all_test_reward/mean": 0.6329787373542786, "rewards/fixed_code_pass_all_test_reward/std": 0.4038151800632477, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.46153846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04971025721170008, "learning_rate": 1.897852563139841e-05, "loss": 0.002, "num_tokens": 20830990.0, "reward": 1.859375, "reward_std": 0.2603869140148163, "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, "rewards/fixed_code_pass_all_test_reward/std": 0.2603869140148163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.46172292934882864, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.05455705150961876, "learning_rate": 1.8977107484937652e-05, "loss": 0.0022, "num_tokens": 20838673.0, "reward": 1.8352272510528564, "reward_std": 0.3562045097351074, "rewards/fixed_code_pass_all_test_reward/mean": 0.9602272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.016070615500211716, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 407.75, "completions/mean_terminated_length": 407.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.4619073971591957, "frac_reward_zero_std": 1.0, "grad_norm": 0.048583984375, "kl": 0.028918299009092152, "learning_rate": 1.8975688407790093e-05, "loss": 0.0012, "num_tokens": 20846263.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.46209186496956284, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03379430738277733, "learning_rate": 1.8974268400102845e-05, "loss": 0.0014, "num_tokens": 20855223.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4622763327799299, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.049331111600622535, "learning_rate": 1.897284746202313e-05, "loss": 0.002, "num_tokens": 20863324.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 555.125, "completions/mean_terminated_length": 555.125, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.462460800590297, "frac_reward_zero_std": 0.0, "grad_norm": 0.484375, "kl": 0.01851571281440556, "learning_rate": 1.897142559369826e-05, "loss": 0.0007, "num_tokens": 20883053.0, "reward": 1.9153225421905518, "reward_std": 0.23950394988059998, "rewards/fixed_code_pass_all_test_reward/mean": 0.9153225421905518, "rewards/fixed_code_pass_all_test_reward/std": 0.2395039200782776, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 581.5, "completions/mean_terminated_length": 581.5, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.4626452684006641, "frac_reward_zero_std": 0.0, "grad_norm": 0.48828125, "kl": 0.018425900605507195, "learning_rate": 1.8970002795275645e-05, "loss": 0.0007, "num_tokens": 20894641.0, "reward": 1.2291666269302368, "reward_std": 0.3204349875450134, "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.32043495774269104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4628297362110312, "frac_reward_zero_std": 1.0, "grad_norm": 0.462890625, "kl": 0.07411512825638056, "learning_rate": 1.896857906690279e-05, "loss": 0.003, "num_tokens": 20904699.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.46301420402139826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.05978736048564315, "learning_rate": 1.8967154408727303e-05, "loss": 0.0024, "num_tokens": 20913742.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 475.25, "completions/mean_terminated_length": 475.25, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.46319867183176533, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.0681481622159481, "learning_rate": 1.8965728820896882e-05, "loss": 0.0027, "num_tokens": 20927560.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.46338313964213246, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.056887043407186866, "learning_rate": 1.8964302303559315e-05, "loss": 0.0023, "num_tokens": 20934891.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 424.25, "completions/mean_terminated_length": 424.25, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.46356760745249953, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.05755751917604357, "learning_rate": 1.89628748568625e-05, "loss": 0.0023, "num_tokens": 20946717.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4637520752628666, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.042790445033460855, "learning_rate": 1.8961446480954422e-05, "loss": 0.0017, "num_tokens": 20951377.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 643.125, "completions/mean_terminated_length": 643.125, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.46393654307323373, "frac_reward_zero_std": 0.0, "grad_norm": 0.68359375, "kl": 0.02398298680782318, "learning_rate": 1.896001717598317e-05, "loss": 0.001, "num_tokens": 20971778.0, "reward": 1.5645160675048828, "reward_std": 0.46555182337760925, "rewards/fixed_code_pass_all_test_reward/mean": 0.5645161271095276, "rewards/fixed_code_pass_all_test_reward/std": 0.46555185317993164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.4641210108836008, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.02868594799656421, "learning_rate": 1.8958586942096922e-05, "loss": 0.0011, "num_tokens": 20978836.0, "reward": 1.7678570747375488, "reward_std": 0.3657134771347046, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4643054786939679, "frac_reward_zero_std": 1.0, "grad_norm": 0.047119140625, "kl": 0.050590952625498176, "learning_rate": 1.8957155779443956e-05, "loss": 0.002, "num_tokens": 20985579.0, "reward": 1.7058823108673096, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.464489946504335, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.0547384072560817, "learning_rate": 1.8955723688172645e-05, "loss": 0.0022, "num_tokens": 20989423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.4646744143147021, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.06283629802055657, "learning_rate": 1.8954290668431458e-05, "loss": 0.0025, "num_tokens": 21001265.0, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.46485888212506915, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06071718130260706, "learning_rate": 1.895285672036896e-05, "loss": 0.0024, "num_tokens": 21009270.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 382.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.4650433499354363, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04056950914673507, "learning_rate": 1.8951421844133815e-05, "loss": 0.0016, "num_tokens": 21021420.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.46522781774580335, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.06935037020593882, "learning_rate": 1.8949986039874782e-05, "loss": 0.0028, "num_tokens": 21029115.0, "reward": 1.9709820747375488, "reward_std": 0.07511989027261734, "rewards/fixed_code_pass_all_test_reward/mean": 0.9709821343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.07511986792087555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4654122855561704, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.1340021677315235, "learning_rate": 1.8948549307740714e-05, "loss": 0.0054, "num_tokens": 21035690.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 586.25, "completions/mean_terminated_length": 586.25, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.46559675336653755, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.06794543284922838, "learning_rate": 1.8947111647880567e-05, "loss": 0.0027, "num_tokens": 21051292.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4657812211769046, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.04875321383588016, "learning_rate": 1.894567306044338e-05, "loss": 0.0019, "num_tokens": 21060889.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4659656889872717, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04893139365594834, "learning_rate": 1.89442335455783e-05, "loss": 0.002, "num_tokens": 21071419.0, "reward": 1.0392156839370728, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.03921568766236305, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.4661501567976388, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03456734656356275, "learning_rate": 1.8942793103434566e-05, "loss": 0.0014, "num_tokens": 21076352.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4663346246080059, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.1055802870541811, "learning_rate": 1.894135173416151e-05, "loss": 0.0042, "num_tokens": 21080742.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.46651909241837297, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.0870319688692689, "learning_rate": 1.8939909437908576e-05, "loss": 0.0035, "num_tokens": 21093580.0, "reward": 1.1749999523162842, "reward_std": 0.345377653837204, "rewards/fixed_code_pass_all_test_reward/mean": 0.17499999701976776, "rewards/fixed_code_pass_all_test_reward/std": 0.3453776240348816, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 528.5, "completions/mean_terminated_length": 528.5, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.4667035602287401, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.02256252709776163, "learning_rate": 1.8938466214825277e-05, "loss": 0.0009, "num_tokens": 21103144.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 364.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4668880280391072, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.04564733384177089, "learning_rate": 1.8937022065061246e-05, "loss": 0.0018, "num_tokens": 21112031.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.46707249584947425, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.06591849634423852, "learning_rate": 1.8935576988766194e-05, "loss": 0.0026, "num_tokens": 21118021.0, "reward": 1.388157844543457, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.5131579041481018, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4672569636598414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.024012718466110528, "learning_rate": 1.8934130986089947e-05, "loss": 0.001, "num_tokens": 21122225.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.46744143147020845, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.05557045107707381, "learning_rate": 1.893268405718241e-05, "loss": 0.0022, "num_tokens": 21130731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 523.75, "completions/mean_terminated_length": 523.75, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.4676258992805755, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.046769768465310335, "learning_rate": 1.8931236202193596e-05, "loss": 0.0019, "num_tokens": 21139505.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 408.875, "completions/mean_terminated_length": 408.875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.46781036709094265, "frac_reward_zero_std": 1.0, "grad_norm": 0.671875, "kl": 0.06395402282942086, "learning_rate": 1.8929787421273606e-05, "loss": 0.0026, "num_tokens": 21152872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.4679948349013097, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.03023316792678088, "learning_rate": 1.8928337714572638e-05, "loss": 0.0012, "num_tokens": 21164488.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4681793027116768, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.03262103453744203, "learning_rate": 1.892688708224099e-05, "loss": 0.0013, "num_tokens": 21173523.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.4683637705220439, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0658258976181969, "learning_rate": 1.8925435524429058e-05, "loss": 0.0026, "num_tokens": 21181130.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.468548238332411, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.03415358567144722, "learning_rate": 1.892398304128732e-05, "loss": 0.0014, "num_tokens": 21186350.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 344.875, "completions/mean_terminated_length": 344.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.46873270614277807, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.019386149593628943, "learning_rate": 1.892252963296637e-05, "loss": 0.0008, "num_tokens": 21193189.0, "reward": 1.9444444179534912, "reward_std": 0.15713484585285187, "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.4689171739531452, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.08176001068204641, "learning_rate": 1.892107529961688e-05, "loss": 0.0033, "num_tokens": 21201015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.46910164176351227, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.020782180363312364, "learning_rate": 1.8919620041389634e-05, "loss": 0.0008, "num_tokens": 21208952.0, "reward": 1.9107142686843872, "reward_std": 0.25253817439079285, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.46928610957387934, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.053500829730182886, "learning_rate": 1.8918163858435498e-05, "loss": 0.0021, "num_tokens": 21213610.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.46947057738424647, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.06531019671820104, "learning_rate": 1.8916706750905436e-05, "loss": 0.0026, "num_tokens": 21217943.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.46965504519461354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.047468837117776275, "learning_rate": 1.891524871895052e-05, "loss": 0.0019, "num_tokens": 21224875.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4698395130049806, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.05453839199617505, "learning_rate": 1.8913789762721898e-05, "loss": 0.0022, "num_tokens": 21233950.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.47002398081534774, "frac_reward_zero_std": 1.0, "grad_norm": 0.2451171875, "kl": 0.08024826692417264, "learning_rate": 1.8912329882370838e-05, "loss": 0.0032, "num_tokens": 21240567.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 548.75, "completions/mean_terminated_length": 548.75, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.4702084486257148, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.03550759796053171, "learning_rate": 1.891086907804868e-05, "loss": 0.0014, "num_tokens": 21254925.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4703929164360819, "frac_reward_zero_std": 1.0, "grad_norm": 0.2275390625, "kl": 0.07591953198425472, "learning_rate": 1.8909407349906876e-05, "loss": 0.003, "num_tokens": 21258955.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.470577384246449, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.07420408120378852, "learning_rate": 1.890794469809696e-05, "loss": 0.003, "num_tokens": 21269687.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 356.25, "completions/mean_terminated_length": 356.25, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.4707618520568161, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.04486829321831465, "learning_rate": 1.8906481122770586e-05, "loss": 0.0018, "num_tokens": 21280233.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 357.125, "completions/mean_terminated_length": 357.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.47094631986718316, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.08316498715430498, "learning_rate": 1.8905016624079472e-05, "loss": 0.0033, "num_tokens": 21290058.0, "reward": 1.85326087474823, "reward_std": 0.1592201292514801, "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.15922017395496368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4711307876775503, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.04232099512591958, "learning_rate": 1.8903551202175457e-05, "loss": 0.0017, "num_tokens": 21298975.0, "reward": 1.9107143878936768, "reward_std": 0.20112654566764832, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.2011265754699707, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.47131525548791736, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.04296208010055125, "learning_rate": 1.890208485721046e-05, "loss": 0.0017, "num_tokens": 21303685.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.47149972329828443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.048732723109424114, "learning_rate": 1.890061758933651e-05, "loss": 0.0019, "num_tokens": 21310058.0, "reward": 1.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 306.625, "completions/mean_terminated_length": 306.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.47168419110865156, "frac_reward_zero_std": 0.0, "grad_norm": 3.609375, "kl": 0.07097500725649297, "learning_rate": 1.8899149398705714e-05, "loss": 0.0028, "num_tokens": 21319199.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.47186865891901864, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.059380451566539705, "learning_rate": 1.889768028547029e-05, "loss": 0.0024, "num_tokens": 21328142.0, "reward": 1.6176470518112183, "reward_std": 0.4111640155315399, "rewards/fixed_code_pass_all_test_reward/mean": 0.6176470518112183, "rewards/fixed_code_pass_all_test_reward/std": 0.4111640453338623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 418.125, "completions/mean_terminated_length": 418.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.4720531267293857, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.06535853166133165, "learning_rate": 1.8896210249782546e-05, "loss": 0.0026, "num_tokens": 21337319.0, "reward": 1.6875, "reward_std": 0.22160130739212036, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.22160132229328156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.47223759453975284, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.051073907408863306, "learning_rate": 1.8894739291794882e-05, "loss": 0.002, "num_tokens": 21343499.0, "reward": 1.7336957454681396, "reward_std": 0.3419041931629181, "rewards/fixed_code_pass_all_test_reward/mean": 0.7336956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.3419041633605957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 644.75, "completions/mean_terminated_length": 644.75, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.4724220623501199, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.0492192980600521, "learning_rate": 1.88932674116598e-05, "loss": 0.002, "num_tokens": 21357785.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.472606530160487, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.08145656669512391, "learning_rate": 1.889179460952989e-05, "loss": 0.0033, "num_tokens": 21368445.0, "reward": 1.7984694242477417, "reward_std": 0.18526604771614075, "rewards/fixed_code_pass_all_test_reward/mean": 0.7984694242477417, "rewards/fixed_code_pass_all_test_reward/std": 0.18526601791381836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 382.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4727909979708541, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.03712722868658602, "learning_rate": 1.8890320885557855e-05, "loss": 0.0015, "num_tokens": 21375828.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4729754657812212, "frac_reward_zero_std": 1.0, "grad_norm": 0.050048828125, "kl": 0.03660167055204511, "learning_rate": 1.8888846239896465e-05, "loss": 0.0015, "num_tokens": 21385938.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.47315993359158826, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.04196989326737821, "learning_rate": 1.8887370672698614e-05, "loss": 0.0017, "num_tokens": 21391536.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4733444014019554, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.046252730302512646, "learning_rate": 1.8885894184117267e-05, "loss": 0.0019, "num_tokens": 21395797.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.47352886921232246, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.05251197947654873, "learning_rate": 1.8884416774305508e-05, "loss": 0.0021, "num_tokens": 21405546.0, "reward": 1.8392857313156128, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.47371333702268953, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.07412926107645035, "learning_rate": 1.8882938443416502e-05, "loss": 0.003, "num_tokens": 21415300.0, "reward": 1.4456522464752197, "reward_std": 0.3935409188270569, "rewards/fixed_code_pass_all_test_reward/mean": 0.570652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.23057831823825836, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.47389780483305666, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.06294345809146762, "learning_rate": 1.8881459191603504e-05, "loss": 0.0025, "num_tokens": 21425628.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.47408227264342373, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.06253949785605073, "learning_rate": 1.8879979019019886e-05, "loss": 0.0025, "num_tokens": 21432945.0, "reward": 1.60869562625885, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6086956262588501, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 384.625, "completions/mean_terminated_length": 384.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.4742667404537908, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.052997185848653316, "learning_rate": 1.8878497925819094e-05, "loss": 0.0021, "num_tokens": 21438998.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.47445120826415793, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.03089377167634666, "learning_rate": 1.887701591215468e-05, "loss": 0.0012, "num_tokens": 21445117.0, "reward": 1.183333396911621, "reward_std": 0.329983115196228, "rewards/fixed_code_pass_all_test_reward/mean": 0.18333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3299831748008728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.474635676074525, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.04278409038670361, "learning_rate": 1.887553297818029e-05, "loss": 0.0017, "num_tokens": 21449140.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 402.875, "completions/mean_terminated_length": 402.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.4748201438848921, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06219977838918567, "learning_rate": 1.8874049124049662e-05, "loss": 0.0025, "num_tokens": 21457003.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4750046116952592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.060884799575433135, "learning_rate": 1.8872564349916637e-05, "loss": 0.0024, "num_tokens": 21466860.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.4751890795056263, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.04024537093937397, "learning_rate": 1.887107865593514e-05, "loss": 0.0016, "num_tokens": 21474885.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47537354731599335, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.07099897786974907, "learning_rate": 1.8869592042259207e-05, "loss": 0.0028, "num_tokens": 21484426.0, "reward": 1.2884615659713745, "reward_std": 0.4404523968696594, "rewards/fixed_code_pass_all_test_reward/mean": 0.2884615361690521, "rewards/fixed_code_pass_all_test_reward/std": 0.44045236706733704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 469.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.4755580151263604, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.0633567743934691, "learning_rate": 1.886810450904295e-05, "loss": 0.0025, "num_tokens": 21497244.0, "reward": 1.7457627058029175, "reward_std": 0.7054944038391113, "rewards/fixed_code_pass_all_test_reward/mean": 0.8707627058029175, "rewards/fixed_code_pass_all_test_reward/std": 0.35204118490219116, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.47574248293672755, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.06306981621310115, "learning_rate": 1.8866616056440597e-05, "loss": 0.0025, "num_tokens": 21508773.0, "reward": 1.2556817531585693, "reward_std": 0.16287250816822052, "rewards/fixed_code_pass_all_test_reward/mean": 0.2556818425655365, "rewards/fixed_code_pass_all_test_reward/std": 0.16287250816822052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.4759269507470946, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07181157660670578, "learning_rate": 1.886512668460645e-05, "loss": 0.0029, "num_tokens": 21520267.0, "reward": 0.887499988079071, "reward_std": 0.36030739545822144, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4761114185574617, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.053099709097296, "learning_rate": 1.8863636393694926e-05, "loss": 0.0021, "num_tokens": 21530506.0, "reward": 1.0714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4762958863678288, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.05512688192538917, "learning_rate": 1.8862145183860522e-05, "loss": 0.0022, "num_tokens": 21538647.0, "reward": 1.928125023841858, "reward_std": 0.20329320430755615, "rewards/fixed_code_pass_all_test_reward/mean": 0.9281250238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.20329320430755615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4764803541781959, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.032642005826346576, "learning_rate": 1.886065305525784e-05, "loss": 0.0013, "num_tokens": 21545168.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 393.625, "completions/mean_terminated_length": 393.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.47666482198856297, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.04569119121879339, "learning_rate": 1.8859160008041573e-05, "loss": 0.0018, "num_tokens": 21555053.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4768492897989301, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.023185471072793007, "learning_rate": 1.8857666042366512e-05, "loss": 0.0009, "num_tokens": 21561681.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.47703375760929717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.025671910145319998, "learning_rate": 1.8856171158387543e-05, "loss": 0.001, "num_tokens": 21565977.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.47721822541966424, "frac_reward_zero_std": 1.0, "grad_norm": 0.265625, "kl": 0.07056557713076472, "learning_rate": 1.885467535625964e-05, "loss": 0.0028, "num_tokens": 21574953.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.47740269323003137, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.06571342574898154, "learning_rate": 1.885317863613788e-05, "loss": 0.0026, "num_tokens": 21583449.0, "reward": 1.875, "reward_std": 0.1157275140285492, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.47758716104039844, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.033105006674304605, "learning_rate": 1.885168099817743e-05, "loss": 0.0013, "num_tokens": 21591383.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 363.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.4777716288507655, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.029328880831599236, "learning_rate": 1.8850182442533568e-05, "loss": 0.0012, "num_tokens": 21600156.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 790.75, "completions/mean_terminated_length": 790.75, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.47795609666113265, "frac_reward_zero_std": 0.0, "grad_norm": 0.52734375, "kl": 0.013846810557879508, "learning_rate": 1.8848682969361637e-05, "loss": 0.0006, "num_tokens": 21617090.0, "reward": 1.2083333730697632, "reward_std": 1.006920576095581, "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.49601587653160095, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 380.625, "completions/mean_terminated_length": 380.625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.4781405644714997, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.03622040431946516, "learning_rate": 1.8847182578817104e-05, "loss": 0.0014, "num_tokens": 21625631.0, "reward": 1.71875, "reward_std": 0.38816189765930176, "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.4783250322818668, "frac_reward_zero_std": 1.0, "grad_norm": 0.875, "kl": 0.11300290073268116, "learning_rate": 1.884568127105552e-05, "loss": 0.0045, "num_tokens": 21632650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.4785095000922339, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.04447584296576679, "learning_rate": 1.884417904623252e-05, "loss": 0.0018, "num_tokens": 21638633.0, "reward": 1.0612244606018066, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.06122449040412903, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.0870233066380024, "learning_rate": 1.8842675904503855e-05, "loss": 0.0035, "num_tokens": 21644816.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.47887843571296806, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.08226994518190622, "learning_rate": 1.8841171846025353e-05, "loss": 0.0033, "num_tokens": 21649315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.4790629035233352, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.068166937911883, "learning_rate": 1.883966687095295e-05, "loss": 0.0027, "num_tokens": 21656883.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.47924737133370227, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.038940017111599445, "learning_rate": 1.8838160979442675e-05, "loss": 0.0016, "num_tokens": 21665072.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.47943183914406934, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.0648960149846971, "learning_rate": 1.883665417165064e-05, "loss": 0.0026, "num_tokens": 21672630.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.47961630695443647, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.036986818769946694, "learning_rate": 1.883514644773307e-05, "loss": 0.0015, "num_tokens": 21683542.0, "reward": 1.4036458730697632, "reward_std": 0.0989387258887291, "rewards/fixed_code_pass_all_test_reward/mean": 0.4036458134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.09893874824047089, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.47980077476480354, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.048862273804843426, "learning_rate": 1.8833637807846266e-05, "loss": 0.002, "num_tokens": 21688903.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.4799852425751706, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.029547258280217648, "learning_rate": 1.883212825214664e-05, "loss": 0.0012, "num_tokens": 21693887.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.48016971038553774, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.08336235024034977, "learning_rate": 1.8830617780790694e-05, "loss": 0.0033, "num_tokens": 21702599.0, "reward": 1.7827380895614624, "reward_std": 0.17995494604110718, "rewards/fixed_code_pass_all_test_reward/mean": 0.7827380895614624, "rewards/fixed_code_pass_all_test_reward/std": 0.17995496094226837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.4803541781959048, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.051262628054246306, "learning_rate": 1.8829106393935016e-05, "loss": 0.0021, "num_tokens": 21706760.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4805386460062719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.03332006628625095, "learning_rate": 1.8827594091736307e-05, "loss": 0.0013, "num_tokens": 21711160.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.480723113816639, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.06753552565351129, "learning_rate": 1.8826080874351343e-05, "loss": 0.0027, "num_tokens": 21717845.0, "reward": 1.1612902879714966, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.16129031777381897, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4809075816270061, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.031605039490386844, "learning_rate": 1.8824566741937012e-05, "loss": 0.0013, "num_tokens": 21727823.0, "reward": 1.5, "reward_std": 0.14304131269454956, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.1430412381887436, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.48109204943737316, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.021539915003813803, "learning_rate": 1.8823051694650282e-05, "loss": 0.0009, "num_tokens": 21731668.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 500.0, "completions/mean_terminated_length": 278.8571472167969, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.4812765172477403, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.06381104308820795, "learning_rate": 1.8821535732648227e-05, "loss": 0.0026, "num_tokens": 21739964.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.48146098505810736, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.06739085214212537, "learning_rate": 1.8820018856088013e-05, "loss": 0.0027, "num_tokens": 21747042.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.48164545286847443, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.06153931561857462, "learning_rate": 1.88185010651269e-05, "loss": 0.0025, "num_tokens": 21757191.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.48182992067884156, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.07224446663167328, "learning_rate": 1.8816982359922236e-05, "loss": 0.0029, "num_tokens": 21767052.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.48201438848920863, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.07759394031018019, "learning_rate": 1.881546274063148e-05, "loss": 0.0031, "num_tokens": 21778697.0, "reward": 1.2217742204666138, "reward_std": 0.19541148841381073, "rewards/fixed_code_pass_all_test_reward/mean": 0.2217741757631302, "rewards/fixed_code_pass_all_test_reward/std": 0.1954115331172943, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4821988562995757, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.0468183069024235, "learning_rate": 1.881394220741217e-05, "loss": 0.0019, "num_tokens": 21788897.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.48238332410994283, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.05207995884120464, "learning_rate": 1.8812420760421942e-05, "loss": 0.0021, "num_tokens": 21793448.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4825677919203099, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0718965525738895, "learning_rate": 1.8810898399818532e-05, "loss": 0.0029, "num_tokens": 21803458.0, "reward": 1.5, "reward_std": 0.4750940203666687, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4750939607620239, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 645.75, "completions/mean_terminated_length": 645.75, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.482752259730677, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.059276743326336145, "learning_rate": 1.8809375125759776e-05, "loss": 0.0024, "num_tokens": 21819176.0, "reward": 1.2041666507720947, "reward_std": 0.3470682203769684, "rewards/fixed_code_pass_all_test_reward/mean": 0.20416668057441711, "rewards/fixed_code_pass_all_test_reward/std": 0.34706825017929077, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4829367275410441, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.038110974011942744, "learning_rate": 1.8807850938403587e-05, "loss": 0.0015, "num_tokens": 21823423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4831211953514112, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.05661628535017371, "learning_rate": 1.8806325837907985e-05, "loss": 0.0023, "num_tokens": 21829695.0, "reward": 1.433823585510254, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.43382352590560913, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 532.0, "completions/mean_terminated_length": 315.4285888671875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.48330566316177825, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.05484827782493085, "learning_rate": 1.8804799824431083e-05, "loss": 0.0022, "num_tokens": 21841871.0, "reward": 1.4427082538604736, "reward_std": 0.7130228281021118, "rewards/fixed_code_pass_all_test_reward/mean": 0.5677083134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.47031813859939575, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.4834901309721454, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.045658388757146895, "learning_rate": 1.880327289813109e-05, "loss": 0.0018, "num_tokens": 21851698.0, "reward": 1.6666667461395264, "reward_std": 0.17817415297031403, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.48367459878251245, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0672224098816514, "learning_rate": 1.8801745059166305e-05, "loss": 0.0027, "num_tokens": 21860387.0, "reward": 1.1029411554336548, "reward_std": 0.5567187070846558, "rewards/fixed_code_pass_all_test_reward/mean": 0.22794118523597717, "rewards/fixed_code_pass_all_test_reward/std": 0.34613537788391113, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.4838590665928795, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.05607604747638106, "learning_rate": 1.8800216307695123e-05, "loss": 0.0022, "num_tokens": 21870177.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.48404353440324666, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.06874278502073139, "learning_rate": 1.8798686643876037e-05, "loss": 0.0027, "num_tokens": 21880145.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.48422800221361373, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.050453036557883024, "learning_rate": 1.8797156067867637e-05, "loss": 0.002, "num_tokens": 21883852.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4844124700239808, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.08801545714959502, "learning_rate": 1.8795624579828593e-05, "loss": 0.0035, "num_tokens": 21892582.0, "reward": 1.6351351737976074, "reward_std": 0.5035613775253296, "rewards/fixed_code_pass_all_test_reward/mean": 0.6351351737976074, "rewards/fixed_code_pass_all_test_reward/std": 0.5035613775253296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.48459693783434793, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.048155746422708035, "learning_rate": 1.8794092179917687e-05, "loss": 0.0019, "num_tokens": 21899595.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.484781405644715, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.06102281576022506, "learning_rate": 1.879255886829378e-05, "loss": 0.0024, "num_tokens": 21903954.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4849658734550821, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.10516053065657616, "learning_rate": 1.879102464511585e-05, "loss": 0.0042, "num_tokens": 21912524.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 505.5, "completions/mean_terminated_length": 505.5, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.4851503412654492, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.05615597194992006, "learning_rate": 1.878948951054294e-05, "loss": 0.0022, "num_tokens": 21921848.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4853348090758163, "frac_reward_zero_std": 1.0, "grad_norm": 0.357421875, "kl": 0.06627262197434902, "learning_rate": 1.878795346473421e-05, "loss": 0.0027, "num_tokens": 21925962.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.48551927688618335, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.06143044214695692, "learning_rate": 1.8786416507848904e-05, "loss": 0.0025, "num_tokens": 21934082.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.4857037446965505, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.06379149632994086, "learning_rate": 1.8784878640046366e-05, "loss": 0.0026, "num_tokens": 21938359.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.48588821250691755, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.11808006651699543, "learning_rate": 1.8783339861486033e-05, "loss": 0.0047, "num_tokens": 21946057.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4860726803172846, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06879752920940518, "learning_rate": 1.8781800172327434e-05, "loss": 0.0028, "num_tokens": 21952371.0, "reward": 1.83695650100708, "reward_std": 0.2250213772058487, "rewards/fixed_code_pass_all_test_reward/mean": 0.8369565010070801, "rewards/fixed_code_pass_all_test_reward/std": 0.2250213921070099, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.48625714812765175, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.06551646068692207, "learning_rate": 1.8780259572730192e-05, "loss": 0.0026, "num_tokens": 21959270.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4864416159380188, "frac_reward_zero_std": 1.0, "grad_norm": 0.21484375, "kl": 0.09365212917327881, "learning_rate": 1.8778718062854025e-05, "loss": 0.0037, "num_tokens": 21967357.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4866260837483859, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.07621316704899073, "learning_rate": 1.877717564285875e-05, "loss": 0.003, "num_tokens": 21973271.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.486810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.04965034918859601, "learning_rate": 1.8775632312904272e-05, "loss": 0.002, "num_tokens": 21982552.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4869950193691201, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.07729699392803013, "learning_rate": 1.87740880731506e-05, "loss": 0.0031, "num_tokens": 21987213.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.48717948717948717, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.06013509491458535, "learning_rate": 1.877254292375782e-05, "loss": 0.0024, "num_tokens": 21999343.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.4873639549898543, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.027457835152745247, "learning_rate": 1.877099686488613e-05, "loss": 0.0011, "num_tokens": 22005673.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.48754842280022137, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.08892476977780461, "learning_rate": 1.8769449896695815e-05, "loss": 0.0036, "num_tokens": 22014396.0, "reward": 1.875, "reward_std": 0.14608041942119598, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.1460804045200348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.48773289061058844, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.04619123344309628, "learning_rate": 1.8767902019347248e-05, "loss": 0.0018, "num_tokens": 22018423.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 398.875, "completions/mean_terminated_length": 398.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4879173584209555, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.08320645708590746, "learning_rate": 1.876635323300091e-05, "loss": 0.0033, "num_tokens": 22028726.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 419.0, "completions/mean_terminated_length": 419.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.48810182623132264, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.0428480498958379, "learning_rate": 1.8764803537817368e-05, "loss": 0.0017, "num_tokens": 22040734.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4882862940416897, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.07826476637274027, "learning_rate": 1.876325293395728e-05, "loss": 0.0031, "num_tokens": 22047567.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.5, "completions/mean_terminated_length": 113.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.4884707618520568, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.04317105608060956, "learning_rate": 1.8761701421581403e-05, "loss": 0.0017, "num_tokens": 22051403.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.4886552296624239, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.05084071250166744, "learning_rate": 1.876014900085059e-05, "loss": 0.002, "num_tokens": 22061123.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.488839697472791, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.05006074230186641, "learning_rate": 1.8758595671925785e-05, "loss": 0.002, "num_tokens": 22069356.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.48902416528315806, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.04425408411771059, "learning_rate": 1.875704143496803e-05, "loss": 0.0018, "num_tokens": 22075998.0, "reward": 1.082446813583374, "reward_std": 0.08274652063846588, "rewards/fixed_code_pass_all_test_reward/mean": 0.08244681358337402, "rewards/fixed_code_pass_all_test_reward/std": 0.08274653553962708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 409.875, "completions/mean_terminated_length": 409.875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.4892086330935252, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.07869835942983627, "learning_rate": 1.8755486290138446e-05, "loss": 0.0031, "num_tokens": 22083733.0, "reward": 1.64673912525177, "reward_std": 0.12821611762046814, "rewards/fixed_code_pass_all_test_reward/mean": 0.64673912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.12821611762046814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.48939310090389226, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0628776231314987, "learning_rate": 1.8753930237598273e-05, "loss": 0.0025, "num_tokens": 22092944.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 613.25, "completions/mean_terminated_length": 613.25, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.48957756871425934, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.02920834848191589, "learning_rate": 1.8752373277508827e-05, "loss": 0.0012, "num_tokens": 22113202.0, "reward": 1.6572580337524414, "reward_std": 0.4796862006187439, "rewards/fixed_code_pass_all_test_reward/mean": 0.7822580337524414, "rewards/fixed_code_pass_all_test_reward/std": 0.4090364873409271, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.48976203652462647, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.08292871667072177, "learning_rate": 1.8750815410031527e-05, "loss": 0.0033, "num_tokens": 22123230.0, "reward": 1.7805233001708984, "reward_std": 0.41159871220588684, "rewards/fixed_code_pass_all_test_reward/mean": 0.7805232405662537, "rewards/fixed_code_pass_all_test_reward/std": 0.41159871220588684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.48994650433499354, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.07881693355739117, "learning_rate": 1.8749256635327877e-05, "loss": 0.0032, "num_tokens": 22128475.0, "reward": 1.329545497894287, "reward_std": 0.13690368831157684, "rewards/fixed_code_pass_all_test_reward/mean": 0.3295454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.13690368831157684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.4901309721453606, "frac_reward_zero_std": 1.0, "grad_norm": 0.03369140625, "kl": 0.019764596479944885, "learning_rate": 1.8747696953559483e-05, "loss": 0.0008, "num_tokens": 22134569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.49031543995572774, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.053213195176795125, "learning_rate": 1.874613636488804e-05, "loss": 0.0021, "num_tokens": 22142153.0, "reward": 1.6634615659713745, "reward_std": 0.4740743041038513, "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.40023237466812134, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.4904999077660948, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.058236134704202414, "learning_rate": 1.8744574869475345e-05, "loss": 0.0023, "num_tokens": 22151105.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 280.875, "completions/mean_terminated_length": 280.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4906843755764619, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.08595698280259967, "learning_rate": 1.8743012467483277e-05, "loss": 0.0034, "num_tokens": 22160064.0, "reward": 1.44921875, "reward_std": 0.2608886957168579, "rewards/fixed_code_pass_all_test_reward/mean": 0.44921875, "rewards/fixed_code_pass_all_test_reward/std": 0.2608887255191803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.490868843386829, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.05344819067977369, "learning_rate": 1.8741449159073818e-05, "loss": 0.0021, "num_tokens": 22166683.0, "reward": 1.767241358757019, "reward_std": 0.38519835472106934, "rewards/fixed_code_pass_all_test_reward/mean": 0.767241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.38519835472106934, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.4910533111971961, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05596211808733642, "learning_rate": 1.8739884944409044e-05, "loss": 0.0022, "num_tokens": 22174471.0, "reward": 1.5, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.49123777900756316, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.04104274197015911, "learning_rate": 1.873831982365112e-05, "loss": 0.0016, "num_tokens": 22182777.0, "reward": 1.8333333730697632, "reward_std": 0.17817412316799164, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 446.375, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.4914222468179303, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04835917765740305, "learning_rate": 1.8736753796962307e-05, "loss": 0.0019, "num_tokens": 22195556.0, "reward": 1.7872023582458496, "reward_std": 0.004209025297313929, "rewards/fixed_code_pass_all_test_reward/mean": 0.7872023582458496, "rewards/fixed_code_pass_all_test_reward/std": 0.004208974074572325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.49160671462829736, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06592613132670522, "learning_rate": 1.8735186864504958e-05, "loss": 0.0026, "num_tokens": 22204170.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.49179118243866443, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.08524602698162198, "learning_rate": 1.873361902644153e-05, "loss": 0.0034, "num_tokens": 22211321.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 418.75, "completions/mean_terminated_length": 418.75, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.49197565024903156, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.030290422961115837, "learning_rate": 1.8732050282934557e-05, "loss": 0.0012, "num_tokens": 22219983.0, "reward": 1.024999976158142, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 425.875, "completions/mean_terminated_length": 425.875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.49216011805939863, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.04072719987016171, "learning_rate": 1.873048063414668e-05, "loss": 0.0016, "num_tokens": 22231966.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4923445858697657, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.0708764863666147, "learning_rate": 1.872891008024063e-05, "loss": 0.0028, "num_tokens": 22239565.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 556.125, "completions/mean_terminated_length": 556.125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.49252905368013283, "frac_reward_zero_std": 1.0, "grad_norm": 0.6171875, "kl": 0.06992996204644442, "learning_rate": 1.8727338621379233e-05, "loss": 0.0028, "num_tokens": 22252582.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4927135214904999, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.05021062935702503, "learning_rate": 1.8725766257725403e-05, "loss": 0.002, "num_tokens": 22258825.0, "reward": 1.9663461446762085, "reward_std": 0.0951874852180481, "rewards/fixed_code_pass_all_test_reward/mean": 0.9663461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.09518745541572571, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.492897989300867, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.03130536514800042, "learning_rate": 1.8724192989442155e-05, "loss": 0.0013, "num_tokens": 22265516.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.4930824571112341, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.06410781713202596, "learning_rate": 1.8722618816692598e-05, "loss": 0.0026, "num_tokens": 22269854.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 584.125, "completions/mean_terminated_length": 584.125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.4932669249216012, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.04553838772699237, "learning_rate": 1.8721043739639927e-05, "loss": 0.0018, "num_tokens": 22285527.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.49345139273196825, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.023106445791199803, "learning_rate": 1.8719467758447435e-05, "loss": 0.0009, "num_tokens": 22292680.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4936358605423354, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.0685864114202559, "learning_rate": 1.8717890873278512e-05, "loss": 0.0027, "num_tokens": 22302086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 474.875, "completions/mean_terminated_length": 474.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.49382032835270245, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.047360674012452364, "learning_rate": 1.871631308429664e-05, "loss": 0.0019, "num_tokens": 22308861.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4940047961630695, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.08060811553150415, "learning_rate": 1.871473439166539e-05, "loss": 0.0032, "num_tokens": 22322808.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.49418926397343665, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.056226518703624606, "learning_rate": 1.871315479554843e-05, "loss": 0.0022, "num_tokens": 22332698.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.4943737317838037, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.02856296324171126, "learning_rate": 1.871157429610953e-05, "loss": 0.0011, "num_tokens": 22340163.0, "reward": 1.7232143878936768, "reward_std": 0.19631260633468628, "rewards/fixed_code_pass_all_test_reward/mean": 0.7232142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.19631259143352509, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.4945581995941708, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.058237980119884014, "learning_rate": 1.8709992893512535e-05, "loss": 0.0023, "num_tokens": 22347654.0, "reward": 1.8571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.49474266740453793, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.07689151540398598, "learning_rate": 1.87084105879214e-05, "loss": 0.0031, "num_tokens": 22356111.0, "reward": 1.2337963581085205, "reward_std": 0.12557433545589447, "rewards/fixed_code_pass_all_test_reward/mean": 0.23379631340503693, "rewards/fixed_code_pass_all_test_reward/std": 0.12557432055473328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.494927135214905, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.06833068933337927, "learning_rate": 1.8706827379500172e-05, "loss": 0.0027, "num_tokens": 22360270.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4951116030252721, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.052572570042684674, "learning_rate": 1.8705243268412977e-05, "loss": 0.0021, "num_tokens": 22366507.0, "reward": 1.8068182468414307, "reward_std": 0.1347913146018982, "rewards/fixed_code_pass_all_test_reward/mean": 0.8068181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.1347913295030594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4952960708356392, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.037722908426076174, "learning_rate": 1.8703658254824052e-05, "loss": 0.0015, "num_tokens": 22371562.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.4954805386460063, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.051082147285342216, "learning_rate": 1.8702072338897725e-05, "loss": 0.002, "num_tokens": 22379389.0, "reward": 1.0078125, "reward_std": 0.022097086533904076, "rewards/fixed_code_pass_all_test_reward/mean": 0.0078125, "rewards/fixed_code_pass_all_test_reward/std": 0.022097086533904076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.49566500645637335, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.037178396014496684, "learning_rate": 1.87004855207984e-05, "loss": 0.0015, "num_tokens": 22389458.0, "reward": 1.6720588207244873, "reward_std": 0.09566734731197357, "rewards/fixed_code_pass_all_test_reward/mean": 0.6720588207244873, "rewards/fixed_code_pass_all_test_reward/std": 0.09566739946603775, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.4958494742667405, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.029861239483579993, "learning_rate": 1.86988978006906e-05, "loss": 0.0012, "num_tokens": 22396219.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.49603394207710755, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.031399325118400156, "learning_rate": 1.8697309178738923e-05, "loss": 0.0013, "num_tokens": 22400727.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4962184098874746, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.053763410774990916, "learning_rate": 1.8695719655108068e-05, "loss": 0.0022, "num_tokens": 22408335.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.49640287769784175, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.05716722644865513, "learning_rate": 1.869412922996283e-05, "loss": 0.0023, "num_tokens": 22419886.0, "reward": 1.3776042461395264, "reward_std": 0.1653360277414322, "rewards/fixed_code_pass_all_test_reward/mean": 0.3776041865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.1653360277414322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 449.75, "completions/mean_terminated_length": 449.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4965873455082088, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.04980372806312516, "learning_rate": 1.8692537903468085e-05, "loss": 0.002, "num_tokens": 22433196.0, "reward": 1.921875, "reward_std": 0.0646936446428299, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.06469365209341049, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4967718133185759, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.05375754344277084, "learning_rate": 1.869094567578882e-05, "loss": 0.0022, "num_tokens": 22437601.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 457.0, "completions/mean_terminated_length": 457.0, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.496956281128943, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.06520343385636806, "learning_rate": 1.86893525470901e-05, "loss": 0.0026, "num_tokens": 22451169.0, "reward": 1.5654761791229248, "reward_std": 0.15573996305465698, "rewards/fixed_code_pass_all_test_reward/mean": 0.5654761791229248, "rewards/fixed_code_pass_all_test_reward/std": 0.15573999285697937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4971407489393101, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.03270853543654084, "learning_rate": 1.86877585175371e-05, "loss": 0.0013, "num_tokens": 22461629.0, "reward": 1.852150559425354, "reward_std": 0.3175621032714844, "rewards/fixed_code_pass_all_test_reward/mean": 0.852150559425354, "rewards/fixed_code_pass_all_test_reward/std": 0.317562073469162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.49732521674967717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.03567098663188517, "learning_rate": 1.8686163587295064e-05, "loss": 0.0014, "num_tokens": 22468332.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4975096845600443, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.03171967202797532, "learning_rate": 1.8684567756529354e-05, "loss": 0.0013, "num_tokens": 22473119.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.49769415237041137, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.07879601046442986, "learning_rate": 1.8682971025405413e-05, "loss": 0.0032, "num_tokens": 22479475.0, "reward": 1.53125, "reward_std": 0.4078085720539093, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.3735266327857971, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 443.0, "completions/mean_terminated_length": 443.0, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.49787862018077844, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.025188424857333302, "learning_rate": 1.868137339408878e-05, "loss": 0.001, "num_tokens": 22488027.0, "reward": 1.2000000476837158, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.49806308799114557, "frac_reward_zero_std": 0.0, "grad_norm": 0.609375, "kl": 0.03303690068423748, "learning_rate": 1.8679774862745082e-05, "loss": 0.0013, "num_tokens": 22493967.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.49824755580151264, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.03930973447859287, "learning_rate": 1.8678175431540053e-05, "loss": 0.0016, "num_tokens": 22498126.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4984320236118797, "frac_reward_zero_std": 1.0, "grad_norm": 0.2333984375, "kl": 0.0703016750048846, "learning_rate": 1.86765751006395e-05, "loss": 0.0028, "num_tokens": 22507498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.49861649142224684, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.06452929903753102, "learning_rate": 1.8674973870209344e-05, "loss": 0.0026, "num_tokens": 22512158.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4988009592326139, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.017709423671476543, "learning_rate": 1.8673371740415586e-05, "loss": 0.0007, "num_tokens": 22518378.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.498985427042981, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.07703635562211275, "learning_rate": 1.8671768711424326e-05, "loss": 0.0031, "num_tokens": 22526774.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4991698948533481, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.04877238115295768, "learning_rate": 1.8670164783401753e-05, "loss": 0.002, "num_tokens": 22535587.0, "reward": 1.9425675868988037, "reward_std": 0.16244345903396606, "rewards/fixed_code_pass_all_test_reward/mean": 0.9425675868988037, "rewards/fixed_code_pass_all_test_reward/std": 0.16244345903396606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4993543626637152, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.030382052063941956, "learning_rate": 1.8668559956514155e-05, "loss": 0.0012, "num_tokens": 22542307.0, "reward": 1.921875, "reward_std": 0.22097086906433105, "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.49953883047408226, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.06432184716686606, "learning_rate": 1.8666954230927904e-05, "loss": 0.0026, "num_tokens": 22548715.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.4997232982844494, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.051102206809446216, "learning_rate": 1.8665347606809475e-05, "loss": 0.002, "num_tokens": 22561720.0, "reward": 1.454741358757019, "reward_std": 0.7262423634529114, "rewards/fixed_code_pass_all_test_reward/mean": 0.579741358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.4866082966327667, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.49990776609481646, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.03672591829672456, "learning_rate": 1.866374008432543e-05, "loss": 0.0015, "num_tokens": 22569538.0, "reward": 1.968181848526001, "reward_std": 0.08999539911746979, "rewards/fixed_code_pass_all_test_reward/mean": 0.968181848526001, "rewards/fixed_code_pass_all_test_reward/std": 0.08999541401863098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5000922339051835, "frac_reward_zero_std": 0.0, "grad_norm": 4.6875, "kl": 0.2124472400173545, "learning_rate": 1.8662131663642437e-05, "loss": 0.0085, "num_tokens": 22576156.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5002767017155506, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.0659384629689157, "learning_rate": 1.866052234492723e-05, "loss": 0.0026, "num_tokens": 22585643.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5004611695259177, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.028452370315790176, "learning_rate": 1.865891212834666e-05, "loss": 0.0011, "num_tokens": 22590521.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 428.0, "completions/mean_terminated_length": 428.0, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.5006456373362849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.028288972564041615, "learning_rate": 1.8657301014067663e-05, "loss": 0.0011, "num_tokens": 22602601.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5008301051466519, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.057589816860854626, "learning_rate": 1.865568900225727e-05, "loss": 0.0023, "num_tokens": 22610920.0, "reward": 1.7857143878936768, "reward_std": 0.3926767408847809, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.23389144241809845, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.501014572957019, "frac_reward_zero_std": 1.0, "grad_norm": 0.421875, "kl": 0.06050415366189554, "learning_rate": 1.86540760930826e-05, "loss": 0.0024, "num_tokens": 22614505.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5011990407673861, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.023827393422834575, "learning_rate": 1.865246228671088e-05, "loss": 0.001, "num_tokens": 22621856.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 603.125, "completions/mean_terminated_length": 603.125, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.5013835085777532, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.02836527058389038, "learning_rate": 1.86508475833094e-05, "loss": 0.0011, "num_tokens": 22639273.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5015679763881202, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.060538227669894695, "learning_rate": 1.864923198304558e-05, "loss": 0.0024, "num_tokens": 22643601.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5017524441984874, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0789107196033001, "learning_rate": 1.8647615486086898e-05, "loss": 0.0032, "num_tokens": 22651477.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5019369120088545, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.053563714027404785, "learning_rate": 1.8645998092600957e-05, "loss": 0.0021, "num_tokens": 22657691.0, "reward": 1.5625, "reward_std": 0.04115033894777298, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.04115033894777298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5021213798192216, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.04365982650779188, "learning_rate": 1.8644379802755428e-05, "loss": 0.0017, "num_tokens": 22666586.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5023058476295886, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.045669612009078264, "learning_rate": 1.8642760616718086e-05, "loss": 0.0018, "num_tokens": 22677809.0, "reward": 0.9130434393882751, "reward_std": 0.3797464668750763, "rewards/fixed_code_pass_all_test_reward/mean": 0.03804347664117813, "rewards/fixed_code_pass_all_test_reward/std": 0.0913117453455925, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.5024903154399557, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.017441502306610346, "learning_rate": 1.8641140534656798e-05, "loss": 0.0007, "num_tokens": 22684783.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5026747832503228, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.06818964099511504, "learning_rate": 1.863951955673953e-05, "loss": 0.0027, "num_tokens": 22693164.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.50285925106069, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.053072353824973106, "learning_rate": 1.863789768313432e-05, "loss": 0.0021, "num_tokens": 22697522.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.503043718871057, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.07619306957349181, "learning_rate": 1.8636274914009325e-05, "loss": 0.003, "num_tokens": 22703870.0, "reward": 1.0535714626312256, "reward_std": 0.0739356130361557, "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5032281866814241, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06275647808797657, "learning_rate": 1.8634651249532778e-05, "loss": 0.0025, "num_tokens": 22712718.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5034126544917912, "frac_reward_zero_std": 1.0, "grad_norm": 0.033203125, "kl": 0.011306561005767435, "learning_rate": 1.863302668987301e-05, "loss": 0.0005, "num_tokens": 22721429.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5035971223021583, "frac_reward_zero_std": 1.0, "grad_norm": 0.447265625, "kl": 0.08814415335655212, "learning_rate": 1.863140123519845e-05, "loss": 0.0035, "num_tokens": 22729812.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5037815901125253, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06425468460656703, "learning_rate": 1.8629774885677604e-05, "loss": 0.0026, "num_tokens": 22737329.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5039660579228925, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.049988374346867204, "learning_rate": 1.8628147641479092e-05, "loss": 0.002, "num_tokens": 22744351.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.5041505257332596, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.03743119747377932, "learning_rate": 1.8626519502771606e-05, "loss": 0.0015, "num_tokens": 22752394.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5043349935436267, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.04011024418286979, "learning_rate": 1.862489046972395e-05, "loss": 0.0016, "num_tokens": 22756978.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5045194613539937, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.035857048351317644, "learning_rate": 1.8623260542505005e-05, "loss": 0.0014, "num_tokens": 22763336.0, "reward": 1.149999976158142, "reward_std": 0.028284244239330292, "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.02828427031636238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 341.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5047039291643608, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.05145525140687823, "learning_rate": 1.8621629721283748e-05, "loss": 0.0021, "num_tokens": 22771085.0, "reward": 1.671875, "reward_std": 0.4565373361110687, "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, "rewards/fixed_code_pass_all_test_reward/std": 0.4565373361110687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5048883969747279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.04882565513253212, "learning_rate": 1.8619998006229262e-05, "loss": 0.002, "num_tokens": 22780205.0, "reward": 1.3571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.505072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.051930279238149524, "learning_rate": 1.8618365397510704e-05, "loss": 0.0021, "num_tokens": 22794657.0, "reward": 1.6171875, "reward_std": 0.22018027305603027, "rewards/fixed_code_pass_all_test_reward/mean": 0.6171875, "rewards/fixed_code_pass_all_test_reward/std": 0.22018027305603027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5052573325954621, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.029758159071207047, "learning_rate": 1.8616731895297338e-05, "loss": 0.0012, "num_tokens": 22803214.0, "reward": 1.9791666269302368, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5054418004058292, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.07370346807874739, "learning_rate": 1.8615097499758508e-05, "loss": 0.0029, "num_tokens": 22810772.0, "reward": 1.5472973585128784, "reward_std": 0.19794942438602448, "rewards/fixed_code_pass_all_test_reward/mean": 0.5472972393035889, "rewards/fixed_code_pass_all_test_reward/std": 0.19794940948486328, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5056262682161963, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.0553805495146662, "learning_rate": 1.8613462211063663e-05, "loss": 0.0022, "num_tokens": 22820550.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5058107360265633, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.05754216713830829, "learning_rate": 1.8611826029382334e-05, "loss": 0.0023, "num_tokens": 22830087.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.5059952038369304, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.03508240572409704, "learning_rate": 1.8610188954884152e-05, "loss": 0.0014, "num_tokens": 22834359.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5061796716472975, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.07310201367363334, "learning_rate": 1.8608550987738838e-05, "loss": 0.0029, "num_tokens": 22842508.0, "reward": 1.4117647409439087, "reward_std": 0.4871050715446472, "rewards/fixed_code_pass_all_test_reward/mean": 0.4117647111415863, "rewards/fixed_code_pass_all_test_reward/std": 0.4871051013469696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5063641394576647, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.05301804514601827, "learning_rate": 1.860691212811621e-05, "loss": 0.0021, "num_tokens": 22852632.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5065486072680317, "frac_reward_zero_std": 0.0, "grad_norm": 0.66796875, "kl": 0.048013882245868444, "learning_rate": 1.8605272376186163e-05, "loss": 0.0019, "num_tokens": 22858558.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5067330750783988, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.03405846678651869, "learning_rate": 1.8603631732118705e-05, "loss": 0.0014, "num_tokens": 22864837.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5069175428887659, "frac_reward_zero_std": 1.0, "grad_norm": 1.421875, "kl": 0.1853981940075755, "learning_rate": 1.8601990196083924e-05, "loss": 0.0074, "num_tokens": 22868735.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.507102010699133, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.05060426262207329, "learning_rate": 1.8600347768252006e-05, "loss": 0.002, "num_tokens": 22873155.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5072864785095, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.07958096358925104, "learning_rate": 1.859870444879322e-05, "loss": 0.0032, "num_tokens": 22876990.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 545.125, "completions/mean_terminated_length": 545.125, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.5074709463198672, "frac_reward_zero_std": 1.0, "grad_norm": 0.037353515625, "kl": 0.023994793533347547, "learning_rate": 1.8597060237877944e-05, "loss": 0.001, "num_tokens": 22887279.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5076554141302343, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.04975181689951569, "learning_rate": 1.859541513567663e-05, "loss": 0.002, "num_tokens": 22894538.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 672.75, "completions/mean_terminated_length": 672.75, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.5078398819406014, "frac_reward_zero_std": 0.0, "grad_norm": 0.6015625, "kl": 0.041624399134889245, "learning_rate": 1.8593769142359838e-05, "loss": 0.0017, "num_tokens": 22911776.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5080243497509684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.055431193206459284, "learning_rate": 1.8592122258098208e-05, "loss": 0.0022, "num_tokens": 22922117.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 213.25, "completions/mean_terminated_length": 213.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5082088175613355, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04109343606978655, "learning_rate": 1.8590474483062483e-05, "loss": 0.0016, "num_tokens": 22929199.0, "reward": 1.625, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5083932853717026, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.05803778977133334, "learning_rate": 1.858882581742349e-05, "loss": 0.0023, "num_tokens": 22939036.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5085777531820698, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.03114568628370762, "learning_rate": 1.8587176261352153e-05, "loss": 0.0012, "num_tokens": 22948959.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5087622209924368, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.06351406406611204, "learning_rate": 1.8585525815019485e-05, "loss": 0.0025, "num_tokens": 22957596.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5089466888028039, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03977635712362826, "learning_rate": 1.85838744785966e-05, "loss": 0.0016, "num_tokens": 22970363.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 459.25, "completions/mean_terminated_length": 459.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.509131156613171, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.03571559232659638, "learning_rate": 1.8582222252254692e-05, "loss": 0.0014, "num_tokens": 22984437.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5093156244235381, "frac_reward_zero_std": 1.0, "grad_norm": 0.061767578125, "kl": 0.06219602515920997, "learning_rate": 1.858056913616505e-05, "loss": 0.0025, "num_tokens": 22991061.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.5095000922339051, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.06495046429336071, "learning_rate": 1.8578915130499063e-05, "loss": 0.0026, "num_tokens": 22995215.0, "reward": 1.75, "reward_std": 0.49601584672927856, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.24800792336463928, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5096845600442723, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.058859196957200766, "learning_rate": 1.8577260235428206e-05, "loss": 0.0024, "num_tokens": 23004027.0, "reward": 1.101694941520691, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.10169491171836853, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5098690278546394, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04777829488739371, "learning_rate": 1.857560445112405e-05, "loss": 0.0019, "num_tokens": 23013112.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5100534956650065, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04993998957797885, "learning_rate": 1.857394777775825e-05, "loss": 0.002, "num_tokens": 23020074.0, "reward": 1.9711538553237915, "reward_std": 0.05341268330812454, "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.05341270938515663, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5102379634753735, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03727330290712416, "learning_rate": 1.8572290215502567e-05, "loss": 0.0015, "num_tokens": 23028946.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5104224312857406, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.04818772594444454, "learning_rate": 1.8570631764528838e-05, "loss": 0.0019, "num_tokens": 23034727.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5106068990961077, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.032342077465727925, "learning_rate": 1.8568972425009e-05, "loss": 0.0013, "num_tokens": 23042140.0, "reward": 1.920454502105713, "reward_std": 0.22498849034309387, "rewards/fixed_code_pass_all_test_reward/mean": 0.9204545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5107913669064749, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.047701354371383786, "learning_rate": 1.856731219711509e-05, "loss": 0.0019, "num_tokens": 23048981.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5109758347168419, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.021630617673508823, "learning_rate": 1.8565651081019223e-05, "loss": 0.0009, "num_tokens": 23054847.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 375.625, "completions/mean_terminated_length": 375.625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.511160302527209, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.030856761848554015, "learning_rate": 1.8563989076893617e-05, "loss": 0.0012, "num_tokens": 23065700.0, "reward": 1.5892857313156128, "reward_std": 0.49744242429733276, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.49744245409965515, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 521.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.5113447703375761, "frac_reward_zero_std": 1.0, "grad_norm": 0.2470703125, "kl": 0.038683447637595236, "learning_rate": 1.856232618491057e-05, "loss": 0.0015, "num_tokens": 23080236.0, "reward": 1.4615384340286255, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4615384638309479, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 336.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5115292381479432, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.039023472694680095, "learning_rate": 1.856066240524249e-05, "loss": 0.0016, "num_tokens": 23086077.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5117137059583102, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.04408479807898402, "learning_rate": 1.855899773806186e-05, "loss": 0.0018, "num_tokens": 23091118.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5118981737686774, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.048626302159391344, "learning_rate": 1.8557332183541262e-05, "loss": 0.0019, "num_tokens": 23097162.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 457.125, "completions/mean_terminated_length": 457.125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.5120826415790445, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.022265153587795794, "learning_rate": 1.855566574185337e-05, "loss": 0.0009, "num_tokens": 23106099.0, "reward": 1.8571429252624512, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3499270975589752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5122671093894116, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.026302648475393653, "learning_rate": 1.855399841317095e-05, "loss": 0.0011, "num_tokens": 23111158.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5124515771997786, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.025299595901742578, "learning_rate": 1.855233019766686e-05, "loss": 0.001, "num_tokens": 23118084.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5126360450101457, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.046765504870563745, "learning_rate": 1.855066109551405e-05, "loss": 0.0019, "num_tokens": 23124785.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5128205128205128, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.044155846000649035, "learning_rate": 1.854899110688556e-05, "loss": 0.0018, "num_tokens": 23135306.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.51300498063088, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.05021716235205531, "learning_rate": 1.8547320231954524e-05, "loss": 0.002, "num_tokens": 23145893.0, "reward": 1.3977272510528564, "reward_std": 0.2872230112552643, "rewards/fixed_code_pass_all_test_reward/mean": 0.39772728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.2872230112552643, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.513189448441247, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.07923578098416328, "learning_rate": 1.8545648470894166e-05, "loss": 0.0032, "num_tokens": 23154612.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5133739162516141, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.05200920789502561, "learning_rate": 1.8543975823877803e-05, "loss": 0.0021, "num_tokens": 23161991.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5135583840619812, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05446542892605066, "learning_rate": 1.8542302291078846e-05, "loss": 0.0022, "num_tokens": 23173809.0, "reward": 1.7999999523162842, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5137428518723482, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.07249522674828768, "learning_rate": 1.8540627872670795e-05, "loss": 0.0029, "num_tokens": 23181271.0, "reward": 1.6666667461395264, "reward_std": 0.4714045226573944, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.4714045524597168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5139273196827153, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.044770811451599, "learning_rate": 1.853895256882724e-05, "loss": 0.0018, "num_tokens": 23188322.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5141117874930825, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0689567131921649, "learning_rate": 1.8537276379721872e-05, "loss": 0.0028, "num_tokens": 23193108.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5142962553034496, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.029760813689790666, "learning_rate": 1.8535599305528463e-05, "loss": 0.0012, "num_tokens": 23198279.0, "reward": 1.912500023841858, "reward_std": 0.2474873811006546, "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5144807231138167, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.06344792759045959, "learning_rate": 1.853392134642088e-05, "loss": 0.0025, "num_tokens": 23205383.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5146651909241837, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.08982344134710729, "learning_rate": 1.8532242502573078e-05, "loss": 0.0036, "num_tokens": 23214059.0, "reward": 1.8802082538604736, "reward_std": 0.33882203698158264, "rewards/fixed_code_pass_all_test_reward/mean": 0.8802083134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.33882200717926025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.5148496587345508, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.047635109163820744, "learning_rate": 1.853056277415912e-05, "loss": 0.0019, "num_tokens": 23221813.0, "reward": 1.625, "reward_std": 0.4464142918586731, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4464142918586731, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5150341265449179, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.07893895031884313, "learning_rate": 1.8528882161353138e-05, "loss": 0.0032, "num_tokens": 23228644.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.515218594355285, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.028219442581757903, "learning_rate": 1.8527200664329378e-05, "loss": 0.0011, "num_tokens": 23238461.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5154030621656521, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.027992335613816977, "learning_rate": 1.8525518283262153e-05, "loss": 0.0011, "num_tokens": 23247577.0, "reward": 1.9015151262283325, "reward_std": 0.2785572111606598, "rewards/fixed_code_pass_all_test_reward/mean": 0.9015151262283325, "rewards/fixed_code_pass_all_test_reward/std": 0.2785572111606598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5155875299760192, "frac_reward_zero_std": 0.0, "grad_norm": 0.455078125, "kl": 0.01824878208572045, "learning_rate": 1.852383501832589e-05, "loss": 0.0007, "num_tokens": 23254824.0, "reward": 1.984375, "reward_std": 0.04419417306780815, "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5157719977863863, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04242036887444556, "learning_rate": 1.85221508696951e-05, "loss": 0.0017, "num_tokens": 23259169.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 792.0, "completions/mean_terminated_length": 792.0, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.5159564655967533, "frac_reward_zero_std": 0.0, "grad_norm": 0.765625, "kl": 0.028247934067621827, "learning_rate": 1.852046583754438e-05, "loss": 0.0011, "num_tokens": 23277897.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 352.625, "completions/mean_terminated_length": 352.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5161409334071204, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.045972500927746296, "learning_rate": 1.8518779922048423e-05, "loss": 0.0018, "num_tokens": 23285414.0, "reward": 1.7750000953674316, "reward_std": 0.310529500246048, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5163254012174876, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.03476559044793248, "learning_rate": 1.8517093123382014e-05, "loss": 0.0014, "num_tokens": 23293332.0, "reward": 1.0625, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.5165098690278547, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.06239828537218273, "learning_rate": 1.8515405441720027e-05, "loss": 0.0025, "num_tokens": 23305779.0, "reward": 1.253151297569275, "reward_std": 0.3803839087486267, "rewards/fixed_code_pass_all_test_reward/mean": 0.2531512677669525, "rewards/fixed_code_pass_all_test_reward/std": 0.3803839087486267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5166943368382217, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.048747298191301525, "learning_rate": 1.8513716877237436e-05, "loss": 0.0019, "num_tokens": 23313540.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5168788046485888, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.04933922551572323, "learning_rate": 1.8512027430109296e-05, "loss": 0.002, "num_tokens": 23322841.0, "reward": 1.807692289352417, "reward_std": 0.3560846447944641, "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.3560846745967865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5170632724589559, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.05617962358519435, "learning_rate": 1.851033710051076e-05, "loss": 0.0022, "num_tokens": 23334530.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.517247740269323, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.022077973233535886, "learning_rate": 1.8508645888617065e-05, "loss": 0.0009, "num_tokens": 23339820.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 558.125, "completions/mean_terminated_length": 558.125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.5174322080796901, "frac_reward_zero_std": 0.0, "grad_norm": 0.7265625, "kl": 0.028335342882201076, "learning_rate": 1.8506953794603548e-05, "loss": 0.0011, "num_tokens": 23349805.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5176166758900572, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.05826432514004409, "learning_rate": 1.8505260818645635e-05, "loss": 0.0023, "num_tokens": 23356140.0, "reward": 1.5700757503509521, "reward_std": 0.6344894766807556, "rewards/fixed_code_pass_all_test_reward/mean": 0.6950757503509521, "rewards/fixed_code_pass_all_test_reward/std": 0.28104063868522644, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5178011437004243, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.04653330682776868, "learning_rate": 1.8503566960918838e-05, "loss": 0.0019, "num_tokens": 23364833.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5179856115107914, "frac_reward_zero_std": 1.0, "grad_norm": 0.189453125, "kl": 0.07116739777848125, "learning_rate": 1.8501872221598774e-05, "loss": 0.0028, "num_tokens": 23374248.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 351.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5181700793211584, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.04943715361878276, "learning_rate": 1.850017660086113e-05, "loss": 0.002, "num_tokens": 23383014.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5183545471315255, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578125, "kl": 0.027977523859590292, "learning_rate": 1.849848009888171e-05, "loss": 0.0011, "num_tokens": 23391331.0, "reward": 1.7374999523162842, "reward_std": 0.36228445172309875, "rewards/fixed_code_pass_all_test_reward/mean": 0.7374999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.36228442192077637, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5185390149418926, "frac_reward_zero_std": 1.0, "grad_norm": 0.24609375, "kl": 0.08416365180164576, "learning_rate": 1.8496782715836386e-05, "loss": 0.0034, "num_tokens": 23396329.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5187234827522598, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.04282577778212726, "learning_rate": 1.8495084451901135e-05, "loss": 0.0017, "num_tokens": 23406719.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5189079505626268, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04288282827474177, "learning_rate": 1.849338530725202e-05, "loss": 0.0017, "num_tokens": 23429734.0, "reward": 1.9337348937988281, "reward_std": 0.12269902974367142, "rewards/fixed_code_pass_all_test_reward/mean": 0.9337349534034729, "rewards/fixed_code_pass_all_test_reward/std": 0.12269905209541321, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5190924183729939, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.05002048425376415, "learning_rate": 1.8491685282065197e-05, "loss": 0.002, "num_tokens": 23433884.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.519276886183361, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.15176389645785093, "learning_rate": 1.848998437651692e-05, "loss": 0.0061, "num_tokens": 23439579.0, "reward": 1.421875, "reward_std": 0.48152241110801697, "rewards/fixed_code_pass_all_test_reward/mean": 0.421875, "rewards/fixed_code_pass_all_test_reward/std": 0.48152244091033936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 309.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5194613539937281, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.04074770538136363, "learning_rate": 1.8488282590783517e-05, "loss": 0.0016, "num_tokens": 23446172.0, "reward": 1.808333396911621, "reward_std": 0.2653239965438843, "rewards/fixed_code_pass_all_test_reward/mean": 0.8083333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.2653239965438843, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5196458218040951, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.057164270896464586, "learning_rate": 1.8486579925041425e-05, "loss": 0.0023, "num_tokens": 23450246.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.5198302896144623, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.059136889642104506, "learning_rate": 1.848487637946716e-05, "loss": 0.0024, "num_tokens": 23458752.0, "reward": 1.3289473056793213, "reward_std": 0.2723943293094635, "rewards/fixed_code_pass_all_test_reward/mean": 0.32894736528396606, "rewards/fixed_code_pass_all_test_reward/std": 0.2723943293094635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5200147574248294, "frac_reward_zero_std": 1.0, "grad_norm": 0.2578125, "kl": 0.04421363747678697, "learning_rate": 1.8483171954237344e-05, "loss": 0.0018, "num_tokens": 23463831.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5201992252351965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.043971346342004836, "learning_rate": 1.848146664952867e-05, "loss": 0.0018, "num_tokens": 23471833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5203836930455635, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.05586092174053192, "learning_rate": 1.8479760465517933e-05, "loss": 0.0022, "num_tokens": 23480995.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5205681608559306, "frac_reward_zero_std": 1.0, "grad_norm": 0.3359375, "kl": 0.0726253513712436, "learning_rate": 1.847805340238203e-05, "loss": 0.0029, "num_tokens": 23491478.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 341.75, "completions/mean_terminated_length": 341.75, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5207526286662977, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.0654238909482956, "learning_rate": 1.8476345460297925e-05, "loss": 0.0026, "num_tokens": 23498612.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5209370964766649, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.03805620735511184, "learning_rate": 1.847463663944269e-05, "loss": 0.0015, "num_tokens": 23505530.0, "reward": 1.8973214626312256, "reward_std": 0.2904188334941864, "rewards/fixed_code_pass_all_test_reward/mean": 0.8973214626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.2904188334941864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5211215642870319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.052640109322965145, "learning_rate": 1.8472926939993487e-05, "loss": 0.0021, "num_tokens": 23514020.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.521306032097399, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03493537427857518, "learning_rate": 1.847121636212757e-05, "loss": 0.0014, "num_tokens": 23522536.0, "reward": 1.2992424964904785, "reward_std": 0.29130449891090393, "rewards/fixed_code_pass_all_test_reward/mean": 0.29924243688583374, "rewards/fixed_code_pass_all_test_reward/std": 0.2913045287132263, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 501.75, "completions/mean_terminated_length": 501.75, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.5214904999077661, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.02579028159379959, "learning_rate": 1.8469504906022267e-05, "loss": 0.001, "num_tokens": 23532470.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5216749677181332, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.06299476441927254, "learning_rate": 1.8467792571855027e-05, "loss": 0.0025, "num_tokens": 23537995.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5218594355285002, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.053371525602415204, "learning_rate": 1.846607935980336e-05, "loss": 0.0021, "num_tokens": 23542067.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5220439033388674, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.04979613143950701, "learning_rate": 1.8464365270044884e-05, "loss": 0.002, "num_tokens": 23550193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5222283711492345, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.050818806514143944, "learning_rate": 1.846265030275731e-05, "loss": 0.002, "num_tokens": 23560689.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5224128389596016, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.04412054060958326, "learning_rate": 1.8460934458118425e-05, "loss": 0.0018, "num_tokens": 23568941.0, "reward": 1.8461538553237915, "reward_std": 0.3513047993183136, "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.3513047993183136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5225973067699686, "frac_reward_zero_std": 1.0, "grad_norm": 0.052490234375, "kl": 0.04565321700647473, "learning_rate": 1.8459217736306125e-05, "loss": 0.0018, "num_tokens": 23576145.0, "reward": 1.076923131942749, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.07692307978868484, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5227817745803357, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.03350291715469211, "learning_rate": 1.8457500137498385e-05, "loss": 0.0013, "num_tokens": 23581066.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5229662423907028, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.04699177551083267, "learning_rate": 1.8455781661873268e-05, "loss": 0.0019, "num_tokens": 23585583.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.52315071020107, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "kl": 0.09024004125967622, "learning_rate": 1.8454062309608946e-05, "loss": 0.0036, "num_tokens": 23591048.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.523335178011437, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.0644664082210511, "learning_rate": 1.8452342080883665e-05, "loss": 0.0026, "num_tokens": 23595376.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5235196458218041, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.06963581638410687, "learning_rate": 1.8450620975875763e-05, "loss": 0.0028, "num_tokens": 23602035.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5237041136321712, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03225951734930277, "learning_rate": 1.844889899476368e-05, "loss": 0.0013, "num_tokens": 23609778.0, "reward": 1.345588207244873, "reward_std": 0.26760777831077576, "rewards/fixed_code_pass_all_test_reward/mean": 0.34558823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.26760780811309814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5238885814425382, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.028353850822895765, "learning_rate": 1.8447176137725934e-05, "loss": 0.0011, "num_tokens": 23616226.0, "reward": 1.6022727489471436, "reward_std": 0.09642365574836731, "rewards/fixed_code_pass_all_test_reward/mean": 0.6022727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 317.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5240730492529053, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.07353658927604556, "learning_rate": 1.8445452404941136e-05, "loss": 0.0029, "num_tokens": 23627558.0, "reward": 1.875, "reward_std": 0.2749859690666199, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5242575170632725, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.04875091719441116, "learning_rate": 1.8443727796588003e-05, "loss": 0.002, "num_tokens": 23637089.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.5244419848736396, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.13551864679902792, "learning_rate": 1.844200231284532e-05, "loss": 0.0054, "num_tokens": 23645370.0, "reward": 1.893617033958435, "reward_std": 0.3008964955806732, "rewards/fixed_code_pass_all_test_reward/mean": 0.8936170339584351, "rewards/fixed_code_pass_all_test_reward/std": 0.3008965253829956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5246264526840066, "frac_reward_zero_std": 1.0, "grad_norm": 0.15625, "kl": 0.06745268451049924, "learning_rate": 1.8440275953891976e-05, "loss": 0.0027, "num_tokens": 23649833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5248109204943737, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.06305395509116352, "learning_rate": 1.8438548719906954e-05, "loss": 0.0025, "num_tokens": 23653689.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.5249953883047408, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.03375433851033449, "learning_rate": 1.8436820611069315e-05, "loss": 0.0014, "num_tokens": 23657521.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5251798561151079, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.022012108704075217, "learning_rate": 1.843509162755822e-05, "loss": 0.0009, "num_tokens": 23662250.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.525364323925475, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04795918520539999, "learning_rate": 1.843336176955292e-05, "loss": 0.0019, "num_tokens": 23668834.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 109.25, "completions/mean_terminated_length": 109.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5255487917358421, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.04178744845557958, "learning_rate": 1.8431631037232756e-05, "loss": 0.0017, "num_tokens": 23672492.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5257332595462092, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.07368254032917321, "learning_rate": 1.8429899430777157e-05, "loss": 0.0029, "num_tokens": 23679425.0, "reward": 1.8142857551574707, "reward_std": 0.28448712825775146, "rewards/fixed_code_pass_all_test_reward/mean": 0.8142856955528259, "rewards/fixed_code_pass_all_test_reward/std": 0.28448715806007385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5259177273565763, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.06328038731589913, "learning_rate": 1.8428166950365645e-05, "loss": 0.0025, "num_tokens": 23690078.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5261021951669433, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.021372593473643064, "learning_rate": 1.8426433596177832e-05, "loss": 0.0009, "num_tokens": 23700256.0, "reward": 1.9680118560791016, "reward_std": 0.02648874931037426, "rewards/fixed_code_pass_all_test_reward/mean": 0.9680118560791016, "rewards/fixed_code_pass_all_test_reward/std": 0.02648872509598732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5262866629773104, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.049210532335564494, "learning_rate": 1.8424699368393423e-05, "loss": 0.002, "num_tokens": 23711303.0, "reward": 1.807692289352417, "reward_std": 0.3560846447944641, "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.3560846745967865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5264711307876776, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.033354627434164286, "learning_rate": 1.8422964267192204e-05, "loss": 0.0013, "num_tokens": 23722093.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5266555985980447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.03479305584914982, "learning_rate": 1.8421228292754064e-05, "loss": 0.0014, "num_tokens": 23728975.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 490.75, "completions/mean_terminated_length": 490.75, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.5268400664084117, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.03214071015827358, "learning_rate": 1.8419491445258977e-05, "loss": 0.0013, "num_tokens": 23741989.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5270245342187788, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.02958951867185533, "learning_rate": 1.8417753724887008e-05, "loss": 0.0012, "num_tokens": 23750457.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5272090020291459, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.050456660450436175, "learning_rate": 1.8416015131818312e-05, "loss": 0.002, "num_tokens": 23754486.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.527393469839513, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.08452646480873227, "learning_rate": 1.8414275666233137e-05, "loss": 0.0034, "num_tokens": 23761658.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5275779376498801, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.08769943751394749, "learning_rate": 1.8412535328311813e-05, "loss": 0.0035, "num_tokens": 23771096.0, "reward": 1.9838709831237793, "reward_std": 0.029865136370062828, "rewards/fixed_code_pass_all_test_reward/mean": 0.9838709831237793, "rewards/fixed_code_pass_all_test_reward/std": 0.029865162447094917, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 377.75, "completions/mean_terminated_length": 377.75, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.5277624054602472, "frac_reward_zero_std": 1.0, "grad_norm": 0.05126953125, "kl": 0.02376266405917704, "learning_rate": 1.841079411823477e-05, "loss": 0.001, "num_tokens": 23777838.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.5279468732706143, "frac_reward_zero_std": 0.0, "grad_norm": 0.69921875, "kl": 0.023376462049782276, "learning_rate": 1.840905203618253e-05, "loss": 0.0009, "num_tokens": 23790188.0, "reward": 1.9107142686843872, "reward_std": 0.25253817439079285, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5281313410809814, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04476763564161956, "learning_rate": 1.8407309082335692e-05, "loss": 0.0018, "num_tokens": 23799877.0, "reward": 1.4319853782653809, "reward_std": 0.7195343375205994, "rewards/fixed_code_pass_all_test_reward/mean": 0.5569853186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.48331257700920105, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5283158088913484, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04008093476295471, "learning_rate": 1.8405565256874962e-05, "loss": 0.0016, "num_tokens": 23811758.0, "reward": 1.0457316637039185, "reward_std": 0.027463216334581375, "rewards/fixed_code_pass_all_test_reward/mean": 0.04573170840740204, "rewards/fixed_code_pass_all_test_reward/std": 0.027463210746645927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5285002767017155, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.03383087646216154, "learning_rate": 1.840382055998112e-05, "loss": 0.0014, "num_tokens": 23821673.0, "reward": 1.8617424964904785, "reward_std": 0.24667370319366455, "rewards/fixed_code_pass_all_test_reward/mean": 0.8617424368858337, "rewards/fixed_code_pass_all_test_reward/std": 0.24667373299598694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5286847445120827, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.03994820243678987, "learning_rate": 1.8402074991835052e-05, "loss": 0.0016, "num_tokens": 23831156.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5288692123224498, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03670158772729337, "learning_rate": 1.8400328552617723e-05, "loss": 0.0015, "num_tokens": 23838787.0, "reward": 1.6453489065170288, "reward_std": 0.19879494607448578, "rewards/fixed_code_pass_all_test_reward/mean": 0.6453487873077393, "rewards/fixed_code_pass_all_test_reward/std": 0.19879494607448578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 431.25, "completions/mean_terminated_length": 431.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5290536801328168, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.08484932337887585, "learning_rate": 1.839858124251019e-05, "loss": 0.0034, "num_tokens": 23849941.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.5292381479431839, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.04827801161445677, "learning_rate": 1.8396833061693607e-05, "loss": 0.0019, "num_tokens": 23855370.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.529422615753551, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.038210643688216805, "learning_rate": 1.8395084010349214e-05, "loss": 0.0015, "num_tokens": 23860295.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5296070835639181, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.06456323666498065, "learning_rate": 1.8393334088658337e-05, "loss": 0.0026, "num_tokens": 23868979.0, "reward": 1.7083332538604736, "reward_std": 0.360958456993103, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.360958456993103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5297915513742852, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.06536012096330523, "learning_rate": 1.8391583296802397e-05, "loss": 0.0026, "num_tokens": 23873900.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 353.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.5299760191846523, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.03246226394549012, "learning_rate": 1.8389831634962907e-05, "loss": 0.0013, "num_tokens": 23880996.0, "reward": 1.8235294818878174, "reward_std": 0.11336753517389297, "rewards/fixed_code_pass_all_test_reward/mean": 0.8235294222831726, "rewards/fixed_code_pass_all_test_reward/std": 0.11336753517389297, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5301604869950194, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.037451985059306026, "learning_rate": 1.838807910332147e-05, "loss": 0.0015, "num_tokens": 23885855.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5303449548053865, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.06036352412775159, "learning_rate": 1.8386325702059767e-05, "loss": 0.0024, "num_tokens": 23890376.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 545.625, "completions/mean_terminated_length": 545.625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.5305294226157535, "frac_reward_zero_std": 0.0, "grad_norm": 0.6875, "kl": 0.020776367862708867, "learning_rate": 1.838457143135959e-05, "loss": 0.0008, "num_tokens": 23899645.0, "reward": 1.9285714626312256, "reward_std": 0.13225999474525452, "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.1322600096464157, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5307138904261206, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.06739483680576086, "learning_rate": 1.8382816291402803e-05, "loss": 0.0027, "num_tokens": 23908711.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5308983582364877, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.06516665406525135, "learning_rate": 1.838106028237137e-05, "loss": 0.0026, "num_tokens": 23919445.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5310828260468549, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.061262635281309485, "learning_rate": 1.8379303404447343e-05, "loss": 0.0025, "num_tokens": 23929648.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5312672938572219, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.05810755817219615, "learning_rate": 1.837754565781286e-05, "loss": 0.0023, "num_tokens": 23935080.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.531451761667589, "frac_reward_zero_std": 1.0, "grad_norm": 0.052490234375, "kl": 0.02686466625891626, "learning_rate": 1.8375787042650154e-05, "loss": 0.0011, "num_tokens": 23940249.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 234.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5316362294779561, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.032278833677992225, "learning_rate": 1.837402755914155e-05, "loss": 0.0013, "num_tokens": 23945192.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 526.125, "completions/mean_terminated_length": 526.125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.5318206972883232, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.02161016664467752, "learning_rate": 1.8372267207469458e-05, "loss": 0.0009, "num_tokens": 23955473.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 435.375, "completions/mean_terminated_length": 435.375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.5320051650986902, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.04296040034387261, "learning_rate": 1.837050598781638e-05, "loss": 0.0017, "num_tokens": 23964236.0, "reward": 1.892045497894287, "reward_std": 0.2874155640602112, "rewards/fixed_code_pass_all_test_reward/mean": 0.8920454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.28741559386253357, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 423.25, "completions/mean_terminated_length": 191.1428680419922, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5321896329090574, "frac_reward_zero_std": 0.0, "grad_norm": 0.51171875, "kl": 0.0417083139764145, "learning_rate": 1.83687439003649e-05, "loss": 0.0017, "num_tokens": 23974910.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5323741007194245, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.07393796229735017, "learning_rate": 1.836698094529771e-05, "loss": 0.003, "num_tokens": 23981957.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5325585685297916, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.05093437968753278, "learning_rate": 1.8365217122797573e-05, "loss": 0.002, "num_tokens": 23989329.0, "reward": 1.7272727489471436, "reward_std": 0.16833092272281647, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5327430363401586, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0502249994315207, "learning_rate": 1.8363452433047356e-05, "loss": 0.002, "num_tokens": 23996339.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5329275041505257, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.05096303578466177, "learning_rate": 1.8361686876230013e-05, "loss": 0.002, "num_tokens": 24006506.0, "reward": 1.0416667461395264, "reward_std": 0.11785116046667099, "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, "rewards/fixed_code_pass_all_test_reward/std": 0.1178511455655098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5331119719608928, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.045845608692616224, "learning_rate": 1.8359920452528577e-05, "loss": 0.0018, "num_tokens": 24016800.0, "reward": 1.8948863744735718, "reward_std": 0.2820216715335846, "rewards/fixed_code_pass_all_test_reward/mean": 0.8948863744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.282021701335907, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.53329643977126, "frac_reward_zero_std": 1.0, "grad_norm": 0.2236328125, "kl": 0.07154117804020643, "learning_rate": 1.8358153162126183e-05, "loss": 0.0029, "num_tokens": 24022093.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.533480907581627, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.03469070536084473, "learning_rate": 1.835638500520605e-05, "loss": 0.0014, "num_tokens": 24030490.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5336653753919941, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.08241589739918709, "learning_rate": 1.8354615981951492e-05, "loss": 0.0033, "num_tokens": 24035847.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5338498432023612, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.0436588975135237, "learning_rate": 1.835284609254591e-05, "loss": 0.0017, "num_tokens": 24041836.0, "reward": 1.09375, "reward_std": 0.1735912710428238, "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 378.25, "completions/mean_terminated_length": 378.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5340343110127282, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.031775604002177715, "learning_rate": 1.835107533717279e-05, "loss": 0.0013, "num_tokens": 24052846.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.5342187788230953, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.08700443943962455, "learning_rate": 1.834930371601572e-05, "loss": 0.0035, "num_tokens": 24056668.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.5344032466334625, "frac_reward_zero_std": 1.0, "grad_norm": 1.203125, "kl": 0.1006898726336658, "learning_rate": 1.8347531229258356e-05, "loss": 0.004, "num_tokens": 24066675.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5345877144438296, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.07155615463852882, "learning_rate": 1.8345757877084472e-05, "loss": 0.0029, "num_tokens": 24076123.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 233.875, "completions/mean_terminated_length": 233.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5347721822541966, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.02944751491304487, "learning_rate": 1.8343983659677912e-05, "loss": 0.0012, "num_tokens": 24081730.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5349566500645637, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.06105159432627261, "learning_rate": 1.834220857722261e-05, "loss": 0.0024, "num_tokens": 24090595.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5351411178749308, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.04223503312096, "learning_rate": 1.8340432629902603e-05, "loss": 0.0017, "num_tokens": 24094483.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 638.625, "completions/mean_terminated_length": 638.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5353255856852979, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.05114214518107474, "learning_rate": 1.8338655817902005e-05, "loss": 0.002, "num_tokens": 24108736.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 413.625, "completions/mean_terminated_length": 413.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.535510053495665, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.07517574448138475, "learning_rate": 1.8336878141405026e-05, "loss": 0.003, "num_tokens": 24118877.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5356945213060321, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.04034720407798886, "learning_rate": 1.833509960059596e-05, "loss": 0.0016, "num_tokens": 24125490.0, "reward": 1.4583332538604736, "reward_std": 0.44854259490966797, "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.44854262471199036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5358789891163992, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.02978092583362013, "learning_rate": 1.8333320195659197e-05, "loss": 0.0012, "num_tokens": 24132221.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 384.25, "completions/mean_terminated_length": 384.25, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.5360634569267663, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06300222594290972, "learning_rate": 1.8331539926779214e-05, "loss": 0.0025, "num_tokens": 24141807.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5362479247371333, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.03432313329540193, "learning_rate": 1.832975879414058e-05, "loss": 0.0014, "num_tokens": 24151252.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 419.625, "completions/mean_terminated_length": 419.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5364323925475004, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.03202848241198808, "learning_rate": 1.8327976797927946e-05, "loss": 0.0013, "num_tokens": 24160617.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5366168603578676, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.09069602843374014, "learning_rate": 1.832619393832606e-05, "loss": 0.0036, "num_tokens": 24168982.0, "reward": 1.875, "reward_std": 0.02022511698305607, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.02022511698305607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 532.75, "completions/mean_terminated_length": 532.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5368013281682347, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.04365922592114657, "learning_rate": 1.8324410215519755e-05, "loss": 0.0017, "num_tokens": 24184460.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 646.625, "completions/mean_terminated_length": 646.625, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.5369857959786017, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "kl": 0.0212044978979975, "learning_rate": 1.8322625629693957e-05, "loss": 0.0008, "num_tokens": 24198761.0, "reward": 1.4583333730697632, "reward_std": 0.501980185508728, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 486.25, "completions/mean_terminated_length": 486.25, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5371702637889688, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.03286727063823491, "learning_rate": 1.832084018103368e-05, "loss": 0.0013, "num_tokens": 24211291.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5373547315993359, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.059785882011055946, "learning_rate": 1.8319053869724032e-05, "loss": 0.0024, "num_tokens": 24216351.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 223.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.537539199409703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.030113519955193624, "learning_rate": 1.8317266695950196e-05, "loss": 0.0012, "num_tokens": 24221560.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 539.625, "completions/mean_terminated_length": 539.625, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.5377236672200701, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.042458487674593925, "learning_rate": 1.8315478659897464e-05, "loss": 0.0017, "num_tokens": 24231309.0, "reward": 1.1666667461395264, "reward_std": 0.10286887735128403, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.10286889970302582, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 489.875, "completions/mean_terminated_length": 489.875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.5379081350304372, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.048516184790059924, "learning_rate": 1.8313689761751197e-05, "loss": 0.0019, "num_tokens": 24241692.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 415.125, "completions/mean_terminated_length": 415.125, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5380926028408043, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.031217518961057067, "learning_rate": 1.831190000169687e-05, "loss": 0.0012, "num_tokens": 24251709.0, "reward": 1.942307710647583, "reward_std": 0.1631784588098526, "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.1631784737110138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 493.375, "completions/mean_terminated_length": 493.375, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5382770706511714, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0719007640145719, "learning_rate": 1.831010937992002e-05, "loss": 0.0029, "num_tokens": 24260248.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5384615384615384, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.05667279870249331, "learning_rate": 1.8308317896606298e-05, "loss": 0.0023, "num_tokens": 24268795.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5386460062719055, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.04545099101960659, "learning_rate": 1.8306525551941424e-05, "loss": 0.0018, "num_tokens": 24275893.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5388304740822727, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04677803046070039, "learning_rate": 1.8304732346111224e-05, "loss": 0.0019, "num_tokens": 24284413.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 402.5, "completions/mean_terminated_length": 402.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5390149418926398, "frac_reward_zero_std": 1.0, "grad_norm": 0.037353515625, "kl": 0.030509008560329676, "learning_rate": 1.8302938279301597e-05, "loss": 0.0012, "num_tokens": 24293833.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5391994097030068, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05443519772961736, "learning_rate": 1.8301143351698547e-05, "loss": 0.0022, "num_tokens": 24301057.0, "reward": 1.6304347515106201, "reward_std": 0.33597835898399353, "rewards/fixed_code_pass_all_test_reward/mean": 0.6304348111152649, "rewards/fixed_code_pass_all_test_reward/std": 0.33597832918167114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 439.5, "completions/mean_terminated_length": 439.5, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.5393838775133739, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04643758945167065, "learning_rate": 1.8299347563488158e-05, "loss": 0.0019, "num_tokens": 24312133.0, "reward": 0.9516129493713379, "reward_std": 0.5123711228370667, "rewards/fixed_code_pass_all_test_reward/mean": 0.20161288976669312, "rewards/fixed_code_pass_all_test_reward/std": 0.08224225044250488, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.539568345323741, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.02165124472230673, "learning_rate": 1.8297550914856602e-05, "loss": 0.0009, "num_tokens": 24316589.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5397528131341081, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.043638721108436584, "learning_rate": 1.8295753405990152e-05, "loss": 0.0017, "num_tokens": 24326086.0, "reward": 1.7236841917037964, "reward_std": 0.4282745122909546, "rewards/fixed_code_pass_all_test_reward/mean": 0.7236841917037964, "rewards/fixed_code_pass_all_test_reward/std": 0.428274542093277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 427.625, "completions/mean_terminated_length": 427.625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5399372809444752, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.04224176541902125, "learning_rate": 1.8293955037075152e-05, "loss": 0.0017, "num_tokens": 24337275.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5401217487548423, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.0699792105006054, "learning_rate": 1.8292155808298054e-05, "loss": 0.0028, "num_tokens": 24342729.0, "reward": 0.9375, "reward_std": 0.33407655358314514, "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5403062165652094, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.09865896543487906, "learning_rate": 1.8290355719845384e-05, "loss": 0.0039, "num_tokens": 24349033.0, "reward": 1.4375, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 397.75, "completions/mean_terminated_length": 397.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5404906843755765, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.04213524959050119, "learning_rate": 1.828855477190376e-05, "loss": 0.0017, "num_tokens": 24356799.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 492.375, "completions/mean_terminated_length": 492.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.5406751521859435, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.02584608062170446, "learning_rate": 1.8286752964659902e-05, "loss": 0.001, "num_tokens": 24366314.0, "reward": 1.7708333730697632, "reward_std": 0.09708036482334137, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.09708039462566376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5408596199963106, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.06530005624517798, "learning_rate": 1.8284950298300605e-05, "loss": 0.0026, "num_tokens": 24376050.0, "reward": 1.796875, "reward_std": 0.3892385959625244, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 507.625, "completions/mean_terminated_length": 507.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.5410440878066778, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.030729789868928492, "learning_rate": 1.8283146773012754e-05, "loss": 0.0012, "num_tokens": 24387287.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5412285556170449, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.03776670293882489, "learning_rate": 1.8281342388983332e-05, "loss": 0.0015, "num_tokens": 24391045.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5414130234274119, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03434872371144593, "learning_rate": 1.82795371463994e-05, "loss": 0.0014, "num_tokens": 24397520.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.541597491237779, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.034962830948643386, "learning_rate": 1.827773104544812e-05, "loss": 0.0014, "num_tokens": 24406308.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5417819590481461, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.0374046522192657, "learning_rate": 1.827592408631673e-05, "loss": 0.0015, "num_tokens": 24412549.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 924.75, "completions/mean_terminated_length": 924.75, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.5419664268585132, "frac_reward_zero_std": 0.0, "grad_norm": 0.578125, "kl": 0.015889783506281674, "learning_rate": 1.827411626919257e-05, "loss": 0.0006, "num_tokens": 24430483.0, "reward": 1.7708332538604736, "reward_std": 0.17677666246891022, "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766773700714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5421508946688803, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.07926293043419719, "learning_rate": 1.8272307594263056e-05, "loss": 0.0032, "num_tokens": 24436546.0, "reward": 1.8250000476837158, "reward_std": 0.36154431104660034, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 595.875, "completions/mean_terminated_length": 595.875, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.5423353624792474, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "kl": 0.018995413440279663, "learning_rate": 1.8270498061715703e-05, "loss": 0.0008, "num_tokens": 24445945.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5425198302896145, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.03580684750340879, "learning_rate": 1.826868767173811e-05, "loss": 0.0014, "num_tokens": 24453727.0, "reward": 1.6527776718139648, "reward_std": 0.20945467054843903, "rewards/fixed_code_pass_all_test_reward/mean": 0.6527777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.20945467054843903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 441.875, "completions/mean_terminated_length": 441.875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5427042980999816, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.033507868414744735, "learning_rate": 1.826687642451797e-05, "loss": 0.0013, "num_tokens": 24464790.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 494.5, "completions/mean_terminated_length": 494.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.5428887659103486, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.050762703409418464, "learning_rate": 1.8265064320243056e-05, "loss": 0.002, "num_tokens": 24476642.0, "reward": 1.52173912525177, "reward_std": 0.22411949932575226, "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.22411948442459106, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 621.75, "completions/mean_terminated_length": 621.75, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.5430732337207157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.05861212476156652, "learning_rate": 1.826325135910124e-05, "loss": 0.0023, "num_tokens": 24491360.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5432577015310828, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.05620584962889552, "learning_rate": 1.826143754128047e-05, "loss": 0.0022, "num_tokens": 24499376.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 727.625, "completions/mean_terminated_length": 727.625, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.54344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.03077749314252287, "learning_rate": 1.8259622866968793e-05, "loss": 0.0012, "num_tokens": 24511725.0, "reward": 1.8229167461395264, "reward_std": 0.2651650607585907, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.543626637151817, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.06243800977244973, "learning_rate": 1.8257807336354353e-05, "loss": 0.0025, "num_tokens": 24522037.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5438111049621841, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.05290189303923398, "learning_rate": 1.8255990949625356e-05, "loss": 0.0021, "num_tokens": 24532666.0, "reward": 1.8928570747375488, "reward_std": 0.30304577946662903, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 399.75, "completions/mean_terminated_length": 399.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5439955727725512, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05672035925090313, "learning_rate": 1.8254173706970125e-05, "loss": 0.0023, "num_tokens": 24540304.0, "reward": 1.732758641242981, "reward_std": 0.24014461040496826, "rewards/fixed_code_pass_all_test_reward/mean": 0.7327585816383362, "rewards/fixed_code_pass_all_test_reward/std": 0.24014464020729065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 405.125, "completions/mean_terminated_length": 405.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.5441800405829182, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.05459321022499353, "learning_rate": 1.8252355608577054e-05, "loss": 0.0022, "num_tokens": 24550225.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5443645083932853, "frac_reward_zero_std": 1.0, "grad_norm": 0.205078125, "kl": 0.05239320429973304, "learning_rate": 1.825053665463463e-05, "loss": 0.0021, "num_tokens": 24559874.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5445489762036525, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.04318390681874007, "learning_rate": 1.8248716845331435e-05, "loss": 0.0017, "num_tokens": 24566945.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 289.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5447334440140196, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.05769297340884805, "learning_rate": 1.8246896180856132e-05, "loss": 0.0023, "num_tokens": 24575884.0, "reward": 1.0138888359069824, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 516.25, "completions/mean_terminated_length": 516.25, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.5449179118243866, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.02848245552740991, "learning_rate": 1.824507466139748e-05, "loss": 0.0011, "num_tokens": 24584422.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 464.5, "completions/mean_terminated_length": 464.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5451023796347537, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.02924036094918847, "learning_rate": 1.8243252287144312e-05, "loss": 0.0012, "num_tokens": 24595882.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5452868474451208, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06980294804088771, "learning_rate": 1.8241429058285568e-05, "loss": 0.0028, "num_tokens": 24600796.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5454713152554879, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.0644823091570288, "learning_rate": 1.8239604975010266e-05, "loss": 0.0026, "num_tokens": 24611304.0, "reward": 1.5035715103149414, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.545655783065855, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.07922717509791255, "learning_rate": 1.8237780037507512e-05, "loss": 0.0032, "num_tokens": 24619297.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5458402508762221, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.052835587644949555, "learning_rate": 1.823595424596651e-05, "loss": 0.0021, "num_tokens": 24623245.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5460247186865892, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.038339386926963925, "learning_rate": 1.823412760057654e-05, "loss": 0.0015, "num_tokens": 24631877.0, "reward": 1.059999942779541, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.05999999865889549, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5462091864969563, "frac_reward_zero_std": 1.0, "grad_norm": 0.053955078125, "kl": 0.026171072269789875, "learning_rate": 1.8232300101526977e-05, "loss": 0.001, "num_tokens": 24639329.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5463936543073233, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.04062277381308377, "learning_rate": 1.8230471749007286e-05, "loss": 0.0016, "num_tokens": 24647051.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5465781221176904, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.028938467847183347, "learning_rate": 1.822864254320702e-05, "loss": 0.0012, "num_tokens": 24656390.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5467625899280576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.044765309197828174, "learning_rate": 1.8226812484315813e-05, "loss": 0.0018, "num_tokens": 24660535.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5469470577384247, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.04779591062106192, "learning_rate": 1.8224981572523397e-05, "loss": 0.0019, "num_tokens": 24665908.0, "reward": 1.451612949371338, "reward_std": 0.05973030999302864, "rewards/fixed_code_pass_all_test_reward/mean": 0.4516128897666931, "rewards/fixed_code_pass_all_test_reward/std": 0.059730324894189835, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5471315255487917, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.019439691270235926, "learning_rate": 1.8223149808019588e-05, "loss": 0.0008, "num_tokens": 24676726.0, "reward": 1.9525315761566162, "reward_std": 0.13426080346107483, "rewards/fixed_code_pass_all_test_reward/mean": 0.952531635761261, "rewards/fixed_code_pass_all_test_reward/std": 0.13426078855991364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5473159933591588, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.07478530332446098, "learning_rate": 1.8221317190994296e-05, "loss": 0.003, "num_tokens": 24680704.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1259.0, "completions/mean_terminated_length": 1259.0, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 0.5475004611695259, "frac_reward_zero_std": 0.0, "grad_norm": 0.609375, "kl": 0.03837016923353076, "learning_rate": 1.8219483721637506e-05, "loss": 0.0015, "num_tokens": 24703392.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.547684928979893, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.03157455159816891, "learning_rate": 1.8217649400139307e-05, "loss": 0.0013, "num_tokens": 24709733.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 423.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5478693967902601, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0464710455853492, "learning_rate": 1.8215814226689867e-05, "loss": 0.0019, "num_tokens": 24720562.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 488.625, "completions/mean_terminated_length": 488.625, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.5480538646006272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.03459405468311161, "learning_rate": 1.821397820147944e-05, "loss": 0.0014, "num_tokens": 24729807.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1913.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 598.0, "completions/mean_terminated_length": 598.0, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5482383324109943, "frac_reward_zero_std": 0.0, "grad_norm": 0.58984375, "kl": 0.03416860767174512, "learning_rate": 1.821214132469838e-05, "loss": 0.0014, "num_tokens": 24742167.0, "reward": 1.6677632331848145, "reward_std": 0.712117075920105, "rewards/fixed_code_pass_all_test_reward/mean": 0.7927631735801697, "rewards/fixed_code_pass_all_test_reward/std": 0.3944704830646515, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5484228002213614, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.026601176243275404, "learning_rate": 1.8210303596537118e-05, "loss": 0.0011, "num_tokens": 24747570.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5486072680317284, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.03270949190482497, "learning_rate": 1.8208465017186178e-05, "loss": 0.0013, "num_tokens": 24758244.0, "reward": 1.9234694242477417, "reward_std": 0.21646122634410858, "rewards/fixed_code_pass_all_test_reward/mean": 0.9234694242477417, "rewards/fixed_code_pass_all_test_reward/std": 0.21646127104759216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 353.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5487917358420955, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.0776722792070359, "learning_rate": 1.8206625586836174e-05, "loss": 0.0031, "num_tokens": 24765035.0, "reward": 1.816176414489746, "reward_std": 0.3686121702194214, "rewards/fixed_code_pass_all_test_reward/mean": 0.8161764740943909, "rewards/fixed_code_pass_all_test_reward/std": 0.3686121702194214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5489762036524627, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.040628436487168074, "learning_rate": 1.8204785305677807e-05, "loss": 0.0016, "num_tokens": 24774255.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5491606714628298, "frac_reward_zero_std": 1.0, "grad_norm": 0.306640625, "kl": 0.08926429552957416, "learning_rate": 1.8202944173901856e-05, "loss": 0.0036, "num_tokens": 24778222.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 528.625, "completions/mean_terminated_length": 528.625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.5493451392731968, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03684735030401498, "learning_rate": 1.8201102191699205e-05, "loss": 0.0015, "num_tokens": 24787491.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 691.0, "completions/mean_terminated_length": 691.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.5495296070835639, "frac_reward_zero_std": 0.0, "grad_norm": 0.703125, "kl": 0.020021209842525423, "learning_rate": 1.819925935926082e-05, "loss": 0.0008, "num_tokens": 24799859.0, "reward": 1.4791666269302368, "reward_std": 0.43129098415374756, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.43129101395606995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.549714074893931, "frac_reward_zero_std": 1.0, "grad_norm": 0.055908203125, "kl": 0.0536803244613111, "learning_rate": 1.8197415676777747e-05, "loss": 0.0021, "num_tokens": 24810170.0, "reward": 1.6285715103149414, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.549898542704298, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.06499074632301927, "learning_rate": 1.8195571144441137e-05, "loss": 0.0026, "num_tokens": 24817402.0, "reward": 1.1612902879714966, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.16129031777381897, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 457.75, "completions/mean_terminated_length": 457.75, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.5500830105146652, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.026158926193602383, "learning_rate": 1.8193725762442206e-05, "loss": 0.001, "num_tokens": 24829872.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5502674783250323, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07190307090058923, "learning_rate": 1.819187953097228e-05, "loss": 0.0029, "num_tokens": 24836398.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 642.625, "completions/mean_terminated_length": 642.625, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.5504519461353994, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.04322959540877491, "learning_rate": 1.819003245022276e-05, "loss": 0.0017, "num_tokens": 24847747.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5506364139457665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.03564634523354471, "learning_rate": 1.818818452038514e-05, "loss": 0.0014, "num_tokens": 24853787.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.5508208817561335, "frac_reward_zero_std": 1.0, "grad_norm": 0.259765625, "kl": 0.06751763564534485, "learning_rate": 1.8186335741651e-05, "loss": 0.0027, "num_tokens": 24857696.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5510053495665006, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.046009131940081716, "learning_rate": 1.8184486114212012e-05, "loss": 0.0018, "num_tokens": 24867390.0, "reward": 1.6307470798492432, "reward_std": 0.25275692343711853, "rewards/fixed_code_pass_all_test_reward/mean": 0.6307471394538879, "rewards/fixed_code_pass_all_test_reward/std": 0.25275692343711853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 935.75, "completions/mean_terminated_length": 776.857177734375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.5511898173768678, "frac_reward_zero_std": 0.0, "grad_norm": 0.267578125, "kl": 0.022496830439195037, "learning_rate": 1.8182635638259932e-05, "loss": 0.0009, "num_tokens": 24881452.0, "reward": 1.716346025466919, "reward_std": 0.3557506501674652, "rewards/fixed_code_pass_all_test_reward/mean": 0.8413461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.013598216697573662, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5513742851872349, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.056536976946517825, "learning_rate": 1.8180784313986603e-05, "loss": 0.0023, "num_tokens": 24890785.0, "reward": 1.3134920597076416, "reward_std": 0.45380091667175293, "rewards/fixed_code_pass_all_test_reward/mean": 0.3134920597076416, "rewards/fixed_code_pass_all_test_reward/std": 0.45380091667175293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.5515587529976019, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07373393326997757, "learning_rate": 1.817893214158396e-05, "loss": 0.0029, "num_tokens": 24894333.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.551743220807969, "frac_reward_zero_std": 1.0, "grad_norm": 0.1787109375, "kl": 0.052373128943145275, "learning_rate": 1.8177079121244023e-05, "loss": 0.0021, "num_tokens": 24899230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5519276886183361, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.03477802500128746, "learning_rate": 1.8175225253158903e-05, "loss": 0.0014, "num_tokens": 24905516.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 483.75, "completions/mean_terminated_length": 483.75, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.5521121564287031, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.018508803623262793, "learning_rate": 1.8173370537520792e-05, "loss": 0.0007, "num_tokens": 24914802.0, "reward": 1.8020833730697632, "reward_std": 0.3271373212337494, "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3271373510360718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 490.25, "completions/mean_terminated_length": 490.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5522966242390703, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.025187405291944742, "learning_rate": 1.8171514974521982e-05, "loss": 0.001, "num_tokens": 24923196.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 352.625, "completions/mean_terminated_length": 352.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5524810920494374, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.03938381338957697, "learning_rate": 1.816965856435484e-05, "loss": 0.0016, "num_tokens": 24931857.0, "reward": 1.9936224222183228, "reward_std": 0.018038442358374596, "rewards/fixed_code_pass_all_test_reward/mean": 0.9936224222183228, "rewards/fixed_code_pass_all_test_reward/std": 0.01803842931985855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5526655598598045, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.04877182422205806, "learning_rate": 1.816780130721182e-05, "loss": 0.002, "num_tokens": 24937877.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5528500276701716, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0493067423813045, "learning_rate": 1.8165943203285484e-05, "loss": 0.002, "num_tokens": 24943633.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5530344954805386, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.0677804525475949, "learning_rate": 1.8164084252768463e-05, "loss": 0.0027, "num_tokens": 24950539.0, "reward": 1.779411792755127, "reward_std": 0.054460056126117706, "rewards/fixed_code_pass_all_test_reward/mean": 0.779411792755127, "rewards/fixed_code_pass_all_test_reward/std": 0.05446000397205353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.5532189632909057, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.05057398322969675, "learning_rate": 1.8162224455853474e-05, "loss": 0.002, "num_tokens": 24963915.0, "reward": 1.875, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.5534034311012729, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.039687435259111226, "learning_rate": 1.8160363812733336e-05, "loss": 0.0016, "num_tokens": 24973558.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.55358789891164, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.04045181313995272, "learning_rate": 1.8158502323600943e-05, "loss": 0.0016, "num_tokens": 24981029.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.553772366722007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0277099609375, "kl": 0.01385993673466146, "learning_rate": 1.8156639988649285e-05, "loss": 0.0006, "num_tokens": 24991154.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.5539568345323741, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.050731312832795084, "learning_rate": 1.8154776808071436e-05, "loss": 0.002, "num_tokens": 25000017.0, "reward": 1.0416666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5541413023427412, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.054657270666211843, "learning_rate": 1.8152912782060556e-05, "loss": 0.0022, "num_tokens": 25006931.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5543257701531082, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04281378327868879, "learning_rate": 1.8151047910809898e-05, "loss": 0.0017, "num_tokens": 25017217.0, "reward": 1.8125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 498.875, "completions/mean_terminated_length": 498.875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.5545102379634754, "frac_reward_zero_std": 1.0, "grad_norm": 0.032958984375, "kl": 0.023587030009366572, "learning_rate": 1.81491821945128e-05, "loss": 0.0009, "num_tokens": 25030024.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5546947057738425, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.06078086397610605, "learning_rate": 1.8147315633362682e-05, "loss": 0.0024, "num_tokens": 25034901.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5548791735842096, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04330155742354691, "learning_rate": 1.814544822755306e-05, "loss": 0.0017, "num_tokens": 25041126.0, "reward": 0.9471153616905212, "reward_std": 0.42079177498817444, "rewards/fixed_code_pass_all_test_reward/mean": 0.19711539149284363, "rewards/fixed_code_pass_all_test_reward/std": 0.19223898649215698, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.5550636413945766, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.07741780462674797, "learning_rate": 1.8143579977277534e-05, "loss": 0.0031, "num_tokens": 25044805.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 401.0, "completions/mean_terminated_length": 401.0, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.5552481092049437, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06641779514029622, "learning_rate": 1.8141710882729792e-05, "loss": 0.0027, "num_tokens": 25056597.0, "reward": 1.8977272510528564, "reward_std": 0.28927093744277954, "rewards/fixed_code_pass_all_test_reward/mean": 0.8977272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.28927096724510193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5554325770153108, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.030493160127662122, "learning_rate": 1.813984094410361e-05, "loss": 0.0012, "num_tokens": 25063162.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5556170448256779, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.036026187939569354, "learning_rate": 1.813797016159285e-05, "loss": 0.0014, "num_tokens": 25071313.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 517.0, "completions/mean_terminated_length": 517.0, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.555801512636045, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.018412979901768267, "learning_rate": 1.813609853539146e-05, "loss": 0.0007, "num_tokens": 25079977.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5559859804464121, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.04467348847538233, "learning_rate": 1.813422606569348e-05, "loss": 0.0018, "num_tokens": 25084445.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 363.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.5561704482567792, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03971067944075912, "learning_rate": 1.8132352752693038e-05, "loss": 0.0016, "num_tokens": 25091967.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5563549160671463, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.0852273222990334, "learning_rate": 1.813047859658434e-05, "loss": 0.0034, "num_tokens": 25101112.0, "reward": 1.5059523582458496, "reward_std": 0.5282528400421143, "rewards/fixed_code_pass_all_test_reward/mean": 0.5059523582458496, "rewards/fixed_code_pass_all_test_reward/std": 0.5282528400421143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 358.0, "completions/mean_terminated_length": 358.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5565393838775133, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.05209075682796538, "learning_rate": 1.8128603597561693e-05, "loss": 0.0021, "num_tokens": 25108192.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5567238516878804, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.028394917375408113, "learning_rate": 1.8126727755819477e-05, "loss": 0.0011, "num_tokens": 25112659.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5569083194982476, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.028425589785911143, "learning_rate": 1.8124851071552176e-05, "loss": 0.0011, "num_tokens": 25121429.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.5570927873086147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.03395583666861057, "learning_rate": 1.8122973544954346e-05, "loss": 0.0014, "num_tokens": 25132254.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5572772551189817, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.03574182232841849, "learning_rate": 1.8121095176220635e-05, "loss": 0.0014, "num_tokens": 25141977.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5574617229293488, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.04595293151214719, "learning_rate": 1.811921596554578e-05, "loss": 0.0018, "num_tokens": 25148627.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5576461907397159, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.018557009403593838, "learning_rate": 1.8117335913124613e-05, "loss": 0.0007, "num_tokens": 25153198.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 631.25, "completions/mean_terminated_length": 631.25, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.557830658550083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.028582318976987153, "learning_rate": 1.8115455019152038e-05, "loss": 0.0011, "num_tokens": 25168432.0, "reward": 1.1333333253860474, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5580151263604501, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.03259554575197399, "learning_rate": 1.8113573283823056e-05, "loss": 0.0013, "num_tokens": 25179767.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 406.75, "completions/mean_terminated_length": 406.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.5581995941708172, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.05117500759661198, "learning_rate": 1.811169070733275e-05, "loss": 0.002, "num_tokens": 25190781.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 253.0, "completions/mean_terminated_length": 253.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5583840619811843, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.048692459939047694, "learning_rate": 1.8109807289876294e-05, "loss": 0.0019, "num_tokens": 25197029.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5585685297915514, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04838477657176554, "learning_rate": 1.8107923031648952e-05, "loss": 0.0019, "num_tokens": 25207691.0, "reward": 1.75, "reward_std": 0.3162277340888977, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3162277638912201, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5587529976019184, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.07281749369576573, "learning_rate": 1.810603793284607e-05, "loss": 0.0029, "num_tokens": 25213794.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 400.125, "completions/mean_terminated_length": 400.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.5589374654122855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.041095297085121274, "learning_rate": 1.810415199366308e-05, "loss": 0.0016, "num_tokens": 25221651.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5591219332226527, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04006575420498848, "learning_rate": 1.8102265214295506e-05, "loss": 0.0016, "num_tokens": 25230580.0, "reward": 1.9083333015441895, "reward_std": 0.2592725157737732, "rewards/fixed_code_pass_all_test_reward/mean": 0.9083333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.2592725157737732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5593064010330198, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.02845871902536601, "learning_rate": 1.8100377594938948e-05, "loss": 0.0011, "num_tokens": 25237023.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 374.625, "completions/mean_terminated_length": 374.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5594908688433868, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04617732926271856, "learning_rate": 1.809848913578912e-05, "loss": 0.0018, "num_tokens": 25247388.0, "reward": 1.7395832538604736, "reward_std": 0.4351552426815033, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.32101497054100037, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5596753366537539, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.032301858940627426, "learning_rate": 1.8096599837041786e-05, "loss": 0.0013, "num_tokens": 25254378.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.559859804464121, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04430544306524098, "learning_rate": 1.809470969889283e-05, "loss": 0.0018, "num_tokens": 25260555.0, "reward": 1.7333333492279053, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7333333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.560044272274488, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.02662464464083314, "learning_rate": 1.80928187215382e-05, "loss": 0.0011, "num_tokens": 25265205.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5602287400848552, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03058117430191487, "learning_rate": 1.8090926905173944e-05, "loss": 0.0012, "num_tokens": 25273851.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.5604132078952223, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.02531021786853671, "learning_rate": 1.8089034249996193e-05, "loss": 0.001, "num_tokens": 25280400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5605976757055894, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.06138602364808321, "learning_rate": 1.8087140756201164e-05, "loss": 0.0025, "num_tokens": 25291095.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5607821435159565, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.0846156133338809, "learning_rate": 1.8085246423985157e-05, "loss": 0.0034, "num_tokens": 25295016.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5609666113263235, "frac_reward_zero_std": 1.0, "grad_norm": 0.051513671875, "kl": 0.036287104012444615, "learning_rate": 1.808335125354457e-05, "loss": 0.0015, "num_tokens": 25306211.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5611510791366906, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.03876070445403457, "learning_rate": 1.8081455245075885e-05, "loss": 0.0016, "num_tokens": 25313547.0, "reward": 1.076923131942749, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.07692307978868484, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 497.625, "completions/mean_terminated_length": 497.625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.5613355469470578, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03324142191559076, "learning_rate": 1.807955839877566e-05, "loss": 0.0013, "num_tokens": 25327168.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 624.25, "completions/mean_terminated_length": 624.25, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.5615200147574249, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.03152924100868404, "learning_rate": 1.8077660714840552e-05, "loss": 0.0013, "num_tokens": 25338234.0, "reward": 1.137930989265442, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13793103396892548, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5617044825677919, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.061444617342203856, "learning_rate": 1.8075762193467296e-05, "loss": 0.0025, "num_tokens": 25346980.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 524.375, "completions/mean_terminated_length": 524.375, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.561888950378159, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.036382993683218956, "learning_rate": 1.807386283485272e-05, "loss": 0.0015, "num_tokens": 25357015.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 424.5, "completions/mean_terminated_length": 424.5, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5620734181885261, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.03431850136257708, "learning_rate": 1.807196263919374e-05, "loss": 0.0014, "num_tokens": 25364779.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5622578859988931, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.02997777797281742, "learning_rate": 1.8070061606687354e-05, "loss": 0.0012, "num_tokens": 25370564.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5624423538092603, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.06041325395926833, "learning_rate": 1.8068159737530644e-05, "loss": 0.0024, "num_tokens": 25374692.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.5626268216196274, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0637503219768405, "learning_rate": 1.8066257031920788e-05, "loss": 0.0026, "num_tokens": 25382791.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 255.25, "completions/mean_terminated_length": 255.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5628112894299945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.05269561498425901, "learning_rate": 1.8064353490055046e-05, "loss": 0.0021, "num_tokens": 25390393.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 141.625, "completions/mean_terminated_length": 141.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5629957572403615, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.056827078107744455, "learning_rate": 1.806244911213076e-05, "loss": 0.0023, "num_tokens": 25394246.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 662.0, "completions/mean_terminated_length": 662.0, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.5631802250507286, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.047894252464175224, "learning_rate": 1.8060543898345368e-05, "loss": 0.0019, "num_tokens": 25410182.0, "reward": 1.1411290168762207, "reward_std": 0.6388952732086182, "rewards/fixed_code_pass_all_test_reward/mean": 0.2661290168762207, "rewards/fixed_code_pass_all_test_reward/std": 0.4551376700401306, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5633646928610957, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.04374005342833698, "learning_rate": 1.8058637848896387e-05, "loss": 0.0017, "num_tokens": 25418490.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5635491606714629, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.038498512003570795, "learning_rate": 1.8056730963981426e-05, "loss": 0.0015, "num_tokens": 25427549.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.56373362848183, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.04802217590622604, "learning_rate": 1.8054823243798178e-05, "loss": 0.0019, "num_tokens": 25432445.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.563918096292197, "frac_reward_zero_std": 0.0, "grad_norm": 0.69140625, "kl": 0.03593837353400886, "learning_rate": 1.8052914688544416e-05, "loss": 0.0014, "num_tokens": 25439139.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5641025641025641, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.023547242977656424, "learning_rate": 1.8051005298418016e-05, "loss": 0.0009, "num_tokens": 25443968.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5642870319129312, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.028751458739861846, "learning_rate": 1.8049095073616927e-05, "loss": 0.0012, "num_tokens": 25450577.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 683.375, "completions/mean_terminated_length": 683.375, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.5644714997232982, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662109375, "kl": 0.02337012381758541, "learning_rate": 1.804718401433919e-05, "loss": 0.0009, "num_tokens": 25471260.0, "reward": 1.0322580337524414, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.032258063554763794, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5646559675336654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.05154152354225516, "learning_rate": 1.8045272120782926e-05, "loss": 0.0021, "num_tokens": 25479527.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5648404353440325, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.07118550734594464, "learning_rate": 1.804335939314635e-05, "loss": 0.0028, "num_tokens": 25487228.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 385.375, "completions/mean_terminated_length": 385.375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.5650249031543996, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.051041068974882364, "learning_rate": 1.8041445831627765e-05, "loss": 0.002, "num_tokens": 25496831.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.5652093709647666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.05082690296694636, "learning_rate": 1.8039531436425548e-05, "loss": 0.002, "num_tokens": 25504009.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 500.125, "completions/mean_terminated_length": 500.125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.5653938387751337, "frac_reward_zero_std": 1.0, "grad_norm": 0.052001953125, "kl": 0.033118094550445676, "learning_rate": 1.803761620773818e-05, "loss": 0.0013, "num_tokens": 25512810.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 512.375, "completions/mean_terminated_length": 512.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.5655783065855008, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.027211367269046605, "learning_rate": 1.8035700145764213e-05, "loss": 0.0011, "num_tokens": 25522877.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.565762774395868, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.053582322085276246, "learning_rate": 1.803378325070229e-05, "loss": 0.0021, "num_tokens": 25532625.0, "reward": 1.578125, "reward_std": 0.46740877628326416, "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, "rewards/fixed_code_pass_all_test_reward/std": 0.46740880608558655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 376.25, "completions/mean_terminated_length": 376.25, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.565947242206235, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.03318937751464546, "learning_rate": 1.8031865522751147e-05, "loss": 0.0013, "num_tokens": 25542843.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5661317100166021, "frac_reward_zero_std": 1.0, "grad_norm": 0.3046875, "kl": 0.052035853266716, "learning_rate": 1.80299469621096e-05, "loss": 0.0021, "num_tokens": 25551843.0, "reward": 1.8333332538604736, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 522.0, "completions/mean_terminated_length": 522.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.5663161778269692, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.03220933233387768, "learning_rate": 1.8028027568976555e-05, "loss": 0.0013, "num_tokens": 25565651.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5665006456373363, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.040862676221877337, "learning_rate": 1.8026107343550996e-05, "loss": 0.0016, "num_tokens": 25575301.0, "reward": 1.850806474685669, "reward_std": 0.35038575530052185, "rewards/fixed_code_pass_all_test_reward/mean": 0.850806474685669, "rewards/fixed_code_pass_all_test_reward/std": 0.35038575530052185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 517.25, "completions/mean_terminated_length": 517.25, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.5666851134477033, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.05594922695308924, "learning_rate": 1.8024186286032002e-05, "loss": 0.0022, "num_tokens": 25584455.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 460.75, "completions/mean_terminated_length": 460.75, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5668695812580705, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.03176037850789726, "learning_rate": 1.8022264396618732e-05, "loss": 0.0013, "num_tokens": 25592805.0, "reward": 1.894230842590332, "reward_std": 0.1958465278148651, "rewards/fixed_code_pass_all_test_reward/mean": 0.8942307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.1958465576171875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5670540490684376, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.057953457813709974, "learning_rate": 1.8020341675510446e-05, "loss": 0.0023, "num_tokens": 25604821.0, "reward": 1.567307710647583, "reward_std": 0.2670634984970093, "rewards/fixed_code_pass_all_test_reward/mean": 0.567307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.2670634984970093, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5672385168788047, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.06375005072914064, "learning_rate": 1.8018418122906465e-05, "loss": 0.0026, "num_tokens": 25610032.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5674229846891717, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.06240540253929794, "learning_rate": 1.801649373900622e-05, "loss": 0.0025, "num_tokens": 25619114.0, "reward": 1.315000057220459, "reward_std": 0.5802216529846191, "rewards/fixed_code_pass_all_test_reward/mean": 0.4399999976158142, "rewards/fixed_code_pass_all_test_reward/std": 0.2931601107120514, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 525.25, "completions/mean_terminated_length": 525.25, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.5676074524995388, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150390625, "kl": 0.029175078379921615, "learning_rate": 1.8014568524009216e-05, "loss": 0.0012, "num_tokens": 25628420.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 421.625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.5677919203099059, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.041583041893318295, "learning_rate": 1.801264247811504e-05, "loss": 0.0017, "num_tokens": 25640017.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 488.875, "completions/mean_terminated_length": 488.875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.567976388120273, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.042831100639887154, "learning_rate": 1.8010715601523383e-05, "loss": 0.0017, "num_tokens": 25650792.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5681608559306401, "frac_reward_zero_std": 1.0, "grad_norm": 0.04248046875, "kl": 0.035763921681791544, "learning_rate": 1.8008787894434003e-05, "loss": 0.0014, "num_tokens": 25659972.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.5683453237410072, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04125258978456259, "learning_rate": 1.800685935704675e-05, "loss": 0.0017, "num_tokens": 25669231.0, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 652.625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.5685297915513743, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.04210096853785217, "learning_rate": 1.8004929989561566e-05, "loss": 0.0017, "num_tokens": 25681076.0, "reward": 1.254166603088379, "reward_std": 0.4604819715023041, "rewards/fixed_code_pass_all_test_reward/mean": 0.2541666626930237, "rewards/fixed_code_pass_all_test_reward/std": 0.46048200130462646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5687142593617414, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.0304510846035555, "learning_rate": 1.8002999792178478e-05, "loss": 0.0012, "num_tokens": 25689730.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5688987271721084, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.05038401670753956, "learning_rate": 1.8001068765097585e-05, "loss": 0.002, "num_tokens": 25694115.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5690831949824755, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.06559236533939838, "learning_rate": 1.79991369085191e-05, "loss": 0.0026, "num_tokens": 25699253.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5692676627928427, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0597276974003762, "learning_rate": 1.799720422264329e-05, "loss": 0.0024, "num_tokens": 25704296.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5694521306032098, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.04970954800955951, "learning_rate": 1.799527070767053e-05, "loss": 0.002, "num_tokens": 25714470.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 424.75, "completions/mean_terminated_length": 424.75, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5696365984135768, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.03950870712287724, "learning_rate": 1.7993336363801272e-05, "loss": 0.0016, "num_tokens": 25722724.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5698210662239439, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.04021455207839608, "learning_rate": 1.7991401191236053e-05, "loss": 0.0016, "num_tokens": 25732521.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.570005534034311, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.039639255846850574, "learning_rate": 1.7989465190175507e-05, "loss": 0.0016, "num_tokens": 25742855.0, "reward": 1.081730842590332, "reward_std": 0.4832080900669098, "rewards/fixed_code_pass_all_test_reward/mean": 0.20673078298568726, "rewards/fixed_code_pass_all_test_reward/std": 0.3230622112751007, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.570190001844678, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.03278004564344883, "learning_rate": 1.798752836082034e-05, "loss": 0.0013, "num_tokens": 25752421.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5703744696550452, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.044994804076850414, "learning_rate": 1.798559070337135e-05, "loss": 0.0018, "num_tokens": 25757314.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 665.0, "completions/mean_terminated_length": 665.0, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.5705589374654123, "frac_reward_zero_std": 0.0, "grad_norm": 0.5546875, "kl": 0.021130827895831317, "learning_rate": 1.798365221802942e-05, "loss": 0.0008, "num_tokens": 25769530.0, "reward": 1.3541667461395264, "reward_std": 0.058925557881593704, "rewards/fixed_code_pass_all_test_reward/mean": 0.3541666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.058925557881593704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5707434052757794, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.06319100176915526, "learning_rate": 1.798171290499552e-05, "loss": 0.0025, "num_tokens": 25776569.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5709278730861465, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.042994259390980005, "learning_rate": 1.7979772764470708e-05, "loss": 0.0017, "num_tokens": 25785342.0, "reward": 1.9659091234207153, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5711123408965135, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.03610195638611913, "learning_rate": 1.797783179665612e-05, "loss": 0.0014, "num_tokens": 25791579.0, "reward": 1.1750000715255737, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5712968087068806, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07988016516901553, "learning_rate": 1.7975890001752987e-05, "loss": 0.0032, "num_tokens": 25800891.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5714812765172478, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0460772723890841, "learning_rate": 1.7973947379962618e-05, "loss": 0.0018, "num_tokens": 25811073.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5716657443276149, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.030236029415391386, "learning_rate": 1.797200393148641e-05, "loss": 0.0012, "num_tokens": 25816198.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5718502121379819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.05689949379302561, "learning_rate": 1.7970059656525853e-05, "loss": 0.0023, "num_tokens": 25821931.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.572034679948349, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0509294094517827, "learning_rate": 1.796811455528251e-05, "loss": 0.002, "num_tokens": 25830741.0, "reward": 1.2870371341705322, "reward_std": 0.06338176131248474, "rewards/fixed_code_pass_all_test_reward/mean": 0.28703704476356506, "rewards/fixed_code_pass_all_test_reward/std": 0.06338173896074295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 478.25, "completions/mean_terminated_length": 478.25, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.5722191477587161, "frac_reward_zero_std": 1.0, "grad_norm": 0.04443359375, "kl": 0.032214547507464886, "learning_rate": 1.796616862795804e-05, "loss": 0.0013, "num_tokens": 25843247.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 482.125, "completions/mean_terminated_length": 482.125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.5724036155690831, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.030857123900204897, "learning_rate": 1.796422187475418e-05, "loss": 0.0012, "num_tokens": 25852896.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5725880833794503, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.06735233240760863, "learning_rate": 1.7962274295872764e-05, "loss": 0.0027, "num_tokens": 25860578.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 393.5, "completions/mean_terminated_length": 393.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5727725511898174, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.03646807977929711, "learning_rate": 1.7960325891515695e-05, "loss": 0.0015, "num_tokens": 25871566.0, "reward": 1.6477272510528564, "reward_std": 0.32979726791381836, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.32979726791381836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5729570190001845, "frac_reward_zero_std": 1.0, "grad_norm": 0.8671875, "kl": 0.08107712818309665, "learning_rate": 1.7958376661884974e-05, "loss": 0.0032, "num_tokens": 25875751.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5731414868105515, "frac_reward_zero_std": 1.0, "grad_norm": 0.038818359375, "kl": 0.038643559673801064, "learning_rate": 1.7956426607182687e-05, "loss": 0.0015, "num_tokens": 25885831.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 622.375, "completions/mean_terminated_length": 622.375, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.5733259546209186, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.03831643424928188, "learning_rate": 1.7954475727611002e-05, "loss": 0.0015, "num_tokens": 25899962.0, "reward": 1.9027777910232544, "reward_std": 0.1384914666414261, "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.13849148154258728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5735104224312857, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.056681342888623476, "learning_rate": 1.795252402337217e-05, "loss": 0.0023, "num_tokens": 25910014.0, "reward": 1.4431817531585693, "reward_std": 0.5464006662368774, "rewards/fixed_code_pass_all_test_reward/mean": 0.5681818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5736948902416529, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.050711131654679775, "learning_rate": 1.7950571494668533e-05, "loss": 0.002, "num_tokens": 25918290.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.57387935805202, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.08169829286634922, "learning_rate": 1.7948618141702516e-05, "loss": 0.0033, "num_tokens": 25927689.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.574063825862387, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.04632511222735047, "learning_rate": 1.7946663964676626e-05, "loss": 0.0019, "num_tokens": 25936655.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5742482936727541, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.028407818637788296, "learning_rate": 1.7944708963793464e-05, "loss": 0.0011, "num_tokens": 25942313.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.5744327614831212, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.06385754852090031, "learning_rate": 1.794275313925571e-05, "loss": 0.0026, "num_tokens": 25947974.0, "reward": 0.9798386693000793, "reward_std": 0.4050878584384918, "rewards/fixed_code_pass_all_test_reward/mean": 0.10483870655298233, "rewards/fixed_code_pass_all_test_reward/std": 0.09561517834663391, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5746172292934882, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.033226786530576646, "learning_rate": 1.794079649126613e-05, "loss": 0.0013, "num_tokens": 25954792.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5748016971038554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.05039692344143987, "learning_rate": 1.793883902002758e-05, "loss": 0.002, "num_tokens": 25965725.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 210.75, "completions/mean_terminated_length": 210.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5749861649142225, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.06900247558951378, "learning_rate": 1.7936880725742992e-05, "loss": 0.0028, "num_tokens": 25974115.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5751706327245896, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.05080791120417416, "learning_rate": 1.7934921608615393e-05, "loss": 0.002, "num_tokens": 25982715.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5753551005349566, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.044273634208366275, "learning_rate": 1.793296166884789e-05, "loss": 0.0018, "num_tokens": 25990217.0, "reward": 1.9034091234207153, "reward_std": 0.2732003331184387, "rewards/fixed_code_pass_all_test_reward/mean": 0.9034091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.2732003331184387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.5755395683453237, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.030609617359004915, "learning_rate": 1.7931000906643675e-05, "loss": 0.0012, "num_tokens": 25995810.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5757240361556908, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0639483192935586, "learning_rate": 1.7929039322206028e-05, "loss": 0.0026, "num_tokens": 26005224.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.575908503966058, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.05460383091121912, "learning_rate": 1.792707691573831e-05, "loss": 0.0022, "num_tokens": 26010859.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.576092971776425, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.04581896890886128, "learning_rate": 1.792511368744398e-05, "loss": 0.0018, "num_tokens": 26016970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5762774395867921, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.05795079516246915, "learning_rate": 1.7923149637526563e-05, "loss": 0.0023, "num_tokens": 26025456.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5764619073971592, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.04561865923460573, "learning_rate": 1.792118476618968e-05, "loss": 0.0018, "num_tokens": 26030280.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 464.0, "completions/mean_terminated_length": 464.0, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5766463752075263, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.03982660220935941, "learning_rate": 1.7919219073637038e-05, "loss": 0.0016, "num_tokens": 26039088.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 431.375, "completions/mean_terminated_length": 431.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.5768308430178933, "frac_reward_zero_std": 1.0, "grad_norm": 0.396484375, "kl": 0.03529946773778647, "learning_rate": 1.7917252560072426e-05, "loss": 0.0014, "num_tokens": 26047403.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5770153108282605, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.05018107523210347, "learning_rate": 1.7915285225699718e-05, "loss": 0.002, "num_tokens": 26055881.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.5771997786386276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.038902438594959676, "learning_rate": 1.791331707072288e-05, "loss": 0.0016, "num_tokens": 26067712.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5773842464489947, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.044972452567890286, "learning_rate": 1.791134809534595e-05, "loss": 0.0018, "num_tokens": 26072675.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5775687142593617, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.05014026677235961, "learning_rate": 1.790937829977306e-05, "loss": 0.002, "num_tokens": 26079347.0, "reward": 1.828125, "reward_std": 0.2829807996749878, "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, "rewards/fixed_code_pass_all_test_reward/std": 0.2829807996749878, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5777531820697288, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.05651051318272948, "learning_rate": 1.7907407684208424e-05, "loss": 0.0023, "num_tokens": 26088296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 683.0, "completions/mean_terminated_length": 683.0, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 0.5779376498800959, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.028272128431126475, "learning_rate": 1.790543624885635e-05, "loss": 0.0011, "num_tokens": 26104872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 467.75, "completions/mean_terminated_length": 467.75, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.5781221176904631, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.02935690840240568, "learning_rate": 1.7903463993921214e-05, "loss": 0.0012, "num_tokens": 26113542.0, "reward": 1.711111068725586, "reward_std": 0.45184317231178284, "rewards/fixed_code_pass_all_test_reward/mean": 0.7111111283302307, "rewards/fixed_code_pass_all_test_reward/std": 0.4518432021141052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5783065855008301, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.041385607328265905, "learning_rate": 1.790149091960749e-05, "loss": 0.0017, "num_tokens": 26122451.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 511.125, "completions/mean_terminated_length": 511.125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.5784910533111972, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.03805990342516452, "learning_rate": 1.7899517026119735e-05, "loss": 0.0015, "num_tokens": 26135020.0, "reward": 1.638888955116272, "reward_std": 0.029695741832256317, "rewards/fixed_code_pass_all_test_reward/mean": 0.6388888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.02969570830464363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.5786755211215643, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04869112162850797, "learning_rate": 1.7897542313662586e-05, "loss": 0.0019, "num_tokens": 26142688.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5788599889319314, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.025036659091711044, "learning_rate": 1.7895566782440768e-05, "loss": 0.001, "num_tokens": 26150918.0, "reward": 1.040816307067871, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.040816325694322586, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5790444567422984, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.05975192226469517, "learning_rate": 1.7893590432659093e-05, "loss": 0.0024, "num_tokens": 26159736.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 470.5, "completions/mean_terminated_length": 470.5, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.5792289245526656, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.03821381670422852, "learning_rate": 1.789161326452246e-05, "loss": 0.0015, "num_tokens": 26171164.0, "reward": 1.4166666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 637.625, "completions/mean_terminated_length": 637.625, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.5794133923630327, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.035475694574415684, "learning_rate": 1.788963527823584e-05, "loss": 0.0014, "num_tokens": 26182193.0, "reward": 1.7629311084747314, "reward_std": 0.32735925912857056, "rewards/fixed_code_pass_all_test_reward/mean": 0.7629310488700867, "rewards/fixed_code_pass_all_test_reward/std": 0.32735928893089294, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5795978601733998, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.0821554206777364, "learning_rate": 1.78876564740043e-05, "loss": 0.0033, "num_tokens": 26186321.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.5797823279837668, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.0441164537332952, "learning_rate": 1.788567685203299e-05, "loss": 0.0018, "num_tokens": 26190688.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5799667957941339, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.05269231228157878, "learning_rate": 1.7883696412527148e-05, "loss": 0.0021, "num_tokens": 26214275.0, "reward": 1.9962348937988281, "reward_std": 0.010649183765053749, "rewards/fixed_code_pass_all_test_reward/mean": 0.9962349534034729, "rewards/fixed_code_pass_all_test_reward/std": 0.010649202391505241, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 460.0, "completions/mean_terminated_length": 460.0, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.580151263604501, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.026075148256495595, "learning_rate": 1.788171515569209e-05, "loss": 0.001, "num_tokens": 26222563.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.580335731414868, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.04716695472598076, "learning_rate": 1.7879733081733216e-05, "loss": 0.0019, "num_tokens": 26227012.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 467.125, "completions/mean_terminated_length": 467.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.5805201992252352, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.04029440716840327, "learning_rate": 1.7877750190856022e-05, "loss": 0.0016, "num_tokens": 26238157.0, "reward": 1.5854430198669434, "reward_std": 0.40623417496681213, "rewards/fixed_code_pass_all_test_reward/mean": 0.7104430198669434, "rewards/fixed_code_pass_all_test_reward/std": 0.11940551549196243, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 698.125, "completions/mean_terminated_length": 698.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.5807046670356023, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.024738609325140715, "learning_rate": 1.787576648326607e-05, "loss": 0.001, "num_tokens": 26253358.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5808891348459694, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.04490569664631039, "learning_rate": 1.7873781959169026e-05, "loss": 0.0018, "num_tokens": 26262338.0, "reward": 1.9296875, "reward_std": 0.19887378811836243, "rewards/fixed_code_pass_all_test_reward/mean": 0.9296875, "rewards/fixed_code_pass_all_test_reward/std": 0.19887378811836243, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5810736026563365, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.04533425020053983, "learning_rate": 1.7871796618770632e-05, "loss": 0.0018, "num_tokens": 26269568.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5812580704667035, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0635696614626795, "learning_rate": 1.7869810462276712e-05, "loss": 0.0025, "num_tokens": 26277903.0, "reward": 1.9249999523162842, "reward_std": 0.10350986570119858, "rewards/fixed_code_pass_all_test_reward/mean": 0.9249999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 511.75, "completions/mean_terminated_length": 511.75, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.5814425382770706, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.04051905055530369, "learning_rate": 1.7867823489893175e-05, "loss": 0.0016, "num_tokens": 26287469.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5816270060874378, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.09875365742482245, "learning_rate": 1.7865835701826025e-05, "loss": 0.004, "num_tokens": 26292364.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5818114738978049, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.049717559479177, "learning_rate": 1.7863847098281336e-05, "loss": 0.002, "num_tokens": 26304880.0, "reward": 1.6989796161651611, "reward_std": 0.13169653713703156, "rewards/fixed_code_pass_all_test_reward/mean": 0.6989796161651611, "rewards/fixed_code_pass_all_test_reward/std": 0.13169650733470917, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5819959417081719, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0691617710981518, "learning_rate": 1.7861857679465275e-05, "loss": 0.0028, "num_tokens": 26312839.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.582180409518539, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.029023444862104952, "learning_rate": 1.7859867445584092e-05, "loss": 0.0012, "num_tokens": 26318137.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5823648773289061, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.09566710283979774, "learning_rate": 1.7857876396844123e-05, "loss": 0.0038, "num_tokens": 26323008.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 489.75, "completions/mean_terminated_length": 489.75, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.5825493451392731, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.04455329128541052, "learning_rate": 1.7855884533451785e-05, "loss": 0.0018, "num_tokens": 26332438.0, "reward": 1.7083333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 607.375, "completions/mean_terminated_length": 607.375, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.5827338129496403, "frac_reward_zero_std": 0.0, "grad_norm": 0.60546875, "kl": 0.03936332673765719, "learning_rate": 1.7853891855613578e-05, "loss": 0.0016, "num_tokens": 26345865.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5829182807600074, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.07044606003910303, "learning_rate": 1.7851898363536094e-05, "loss": 0.0028, "num_tokens": 26351505.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5831027485703745, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.098426828160882, "learning_rate": 1.7849904057426006e-05, "loss": 0.0039, "num_tokens": 26357004.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5832872163807415, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.07178407348692417, "learning_rate": 1.7847908937490067e-05, "loss": 0.0029, "num_tokens": 26361692.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5834716841911086, "frac_reward_zero_std": 1.0, "grad_norm": 0.376953125, "kl": 0.05240453558508307, "learning_rate": 1.784591300393512e-05, "loss": 0.0021, "num_tokens": 26370769.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5836561520014757, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.06555538438260555, "learning_rate": 1.7843916256968082e-05, "loss": 0.0026, "num_tokens": 26377651.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 468.75, "completions/mean_terminated_length": 468.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5838406198118429, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.02451432344969362, "learning_rate": 1.7841918696795976e-05, "loss": 0.001, "num_tokens": 26387345.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.58402508762221, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.028775631450116634, "learning_rate": 1.7839920323625888e-05, "loss": 0.0012, "num_tokens": 26392581.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 846.125, "completions/mean_terminated_length": 846.125, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 0.584209555432577, "frac_reward_zero_std": 0.0, "grad_norm": 0.482421875, "kl": 0.01883645763155073, "learning_rate": 1.7837921137665e-05, "loss": 0.0008, "num_tokens": 26409894.0, "reward": 1.4166667461395264, "reward_std": 0.6606874465942383, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5843940232429441, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.05124183953739703, "learning_rate": 1.7835921139120568e-05, "loss": 0.002, "num_tokens": 26420322.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5845784910533112, "frac_reward_zero_std": 1.0, "grad_norm": 0.52734375, "kl": 0.12231877679005265, "learning_rate": 1.783392032819994e-05, "loss": 0.0049, "num_tokens": 26424297.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5847629588636782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.052390101831406355, "learning_rate": 1.7831918705110555e-05, "loss": 0.0021, "num_tokens": 26432985.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.5849474266740454, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.0412308715749532, "learning_rate": 1.782991627005992e-05, "loss": 0.0016, "num_tokens": 26441728.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5851318944844125, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.06305244262330234, "learning_rate": 1.782791302325563e-05, "loss": 0.0025, "num_tokens": 26445910.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5853163622947796, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.06389524089172482, "learning_rate": 1.7825908964905378e-05, "loss": 0.0026, "num_tokens": 26454762.0, "reward": 1.059999942779541, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.05999999865889549, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.5855008301051466, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203125, "kl": 0.027664771070703864, "learning_rate": 1.782390409521693e-05, "loss": 0.0011, "num_tokens": 26467255.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 575.375, "completions/mean_terminated_length": 575.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5856852979155137, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078125, "kl": 0.025251635815948248, "learning_rate": 1.7821898414398134e-05, "loss": 0.001, "num_tokens": 26476722.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5858697657258808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.028402433032169938, "learning_rate": 1.7819891922656924e-05, "loss": 0.0011, "num_tokens": 26483737.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 719.0, "completions/mean_terminated_length": 719.0, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.586054233536248, "frac_reward_zero_std": 0.0, "grad_norm": 0.6171875, "kl": 0.021255666739307344, "learning_rate": 1.7817884620201326e-05, "loss": 0.0009, "num_tokens": 26499049.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.586238701346615, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.048078659805469215, "learning_rate": 1.7815876507239437e-05, "loss": 0.0019, "num_tokens": 26507977.0, "reward": 1.587837815284729, "reward_std": 0.06420189887285233, "rewards/fixed_code_pass_all_test_reward/mean": 0.587837815284729, "rewards/fixed_code_pass_all_test_reward/std": 0.06420188397169113, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5864231691569821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.02891356567852199, "learning_rate": 1.7813867583979454e-05, "loss": 0.0012, "num_tokens": 26517059.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5866076369673492, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.03221922158263624, "learning_rate": 1.781185785062964e-05, "loss": 0.0013, "num_tokens": 26526917.0, "reward": 1.3017240762710571, "reward_std": 0.11173688620328903, "rewards/fixed_code_pass_all_test_reward/mean": 0.3017241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.11173690110445023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5867921047777163, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.04003114975057542, "learning_rate": 1.7809847307398352e-05, "loss": 0.0016, "num_tokens": 26531963.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 633.25, "completions/mean_terminated_length": 431.14288330078125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5869765725880833, "frac_reward_zero_std": 0.0, "grad_norm": 0.4765625, "kl": 0.027903419570066035, "learning_rate": 1.7807835954494033e-05, "loss": 0.0011, "num_tokens": 26544885.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5871610403984505, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06207381980493665, "learning_rate": 1.7805823792125206e-05, "loss": 0.0025, "num_tokens": 26553954.0, "reward": 1.7904411554336548, "reward_std": 0.39754486083984375, "rewards/fixed_code_pass_all_test_reward/mean": 0.7904411554336548, "rewards/fixed_code_pass_all_test_reward/std": 0.39754483103752136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.5873455082088176, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.03612837055698037, "learning_rate": 1.7803810820500475e-05, "loss": 0.0014, "num_tokens": 26562666.0, "reward": 1.8541666269302368, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 494.75, "completions/mean_terminated_length": 494.75, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.5875299760191847, "frac_reward_zero_std": 0.0, "grad_norm": 0.78125, "kl": 0.030450191698037088, "learning_rate": 1.7801797039828534e-05, "loss": 0.0012, "num_tokens": 26571368.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5877144438295517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322265625, "kl": 0.02757272101007402, "learning_rate": 1.7799782450318164e-05, "loss": 0.0011, "num_tokens": 26579849.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5878989116399188, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04705683677457273, "learning_rate": 1.7797767052178216e-05, "loss": 0.0019, "num_tokens": 26587089.0, "reward": 1.9464285373687744, "reward_std": 0.1515229046344757, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5880833794502859, "frac_reward_zero_std": 1.0, "grad_norm": 0.0419921875, "kl": 0.037489519687369466, "learning_rate": 1.7795750845617633e-05, "loss": 0.0015, "num_tokens": 26594543.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5882678472606531, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.03123515483457595, "learning_rate": 1.779373383084545e-05, "loss": 0.0012, "num_tokens": 26601712.0, "reward": 1.8645833730697632, "reward_std": 0.24372072517871857, "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.24372074007987976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5884523150710201, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.04426570073701441, "learning_rate": 1.7791716008070772e-05, "loss": 0.0018, "num_tokens": 26610691.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5886367828813872, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.020781970990356058, "learning_rate": 1.7789697377502793e-05, "loss": 0.0008, "num_tokens": 26616445.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5888212506917543, "frac_reward_zero_std": 1.0, "grad_norm": 0.27734375, "kl": 0.08586529735475779, "learning_rate": 1.778767793935079e-05, "loss": 0.0034, "num_tokens": 26622530.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 267.5, "completions/mean_terminated_length": 267.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5890057185021214, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.027313244179822505, "learning_rate": 1.778565769382413e-05, "loss": 0.0011, "num_tokens": 26634358.0, "reward": 1.3602941036224365, "reward_std": 0.22366374731063843, "rewards/fixed_code_pass_all_test_reward/mean": 0.3602941334247589, "rewards/fixed_code_pass_all_test_reward/std": 0.22366376221179962, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5891901863124884, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.05368939717300236, "learning_rate": 1.7783636641132257e-05, "loss": 0.0021, "num_tokens": 26642292.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5893746541228556, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.04911674140021205, "learning_rate": 1.7781614781484697e-05, "loss": 0.002, "num_tokens": 26651126.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.5895591219332227, "frac_reward_zero_std": 1.0, "grad_norm": 0.042236328125, "kl": 0.031151547096669674, "learning_rate": 1.7779592115091066e-05, "loss": 0.0012, "num_tokens": 26658758.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5897435897435898, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.05404496344272047, "learning_rate": 1.7777568642161058e-05, "loss": 0.0022, "num_tokens": 26669084.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5899280575539568, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.0564572699368, "learning_rate": 1.7775544362904456e-05, "loss": 0.0023, "num_tokens": 26676502.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 223.125, "completions/mean_terminated_length": 223.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.5901125253643239, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.0726684033870697, "learning_rate": 1.777351927753112e-05, "loss": 0.0029, "num_tokens": 26684007.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.590296993174691, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.05096497084014118, "learning_rate": 1.7771493386251007e-05, "loss": 0.002, "num_tokens": 26692159.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 471.875, "completions/mean_terminated_length": 471.875, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.5904814609850582, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.046799191273748875, "learning_rate": 1.776946668927413e-05, "loss": 0.0019, "num_tokens": 26701598.0, "reward": 1.7833333015441895, "reward_std": 0.23162643611431122, "rewards/fixed_code_pass_all_test_reward/mean": 0.7833333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.23162642121315002, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 671.5, "completions/mean_terminated_length": 671.5, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.5906659287954252, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.03589981538243592, "learning_rate": 1.7767439186810628e-05, "loss": 0.0014, "num_tokens": 26717474.0, "reward": 1.9791667461395264, "reward_std": 0.017251623794436455, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.017251653596758842, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5908503966057923, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.053446023957803845, "learning_rate": 1.7765410879070676e-05, "loss": 0.0021, "num_tokens": 26726083.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5910348644161594, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.0513851591385901, "learning_rate": 1.7763381766264566e-05, "loss": 0.0021, "num_tokens": 26732065.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 339.625, "completions/mean_terminated_length": 339.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5912193322265265, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.07344930665567517, "learning_rate": 1.7761351848602664e-05, "loss": 0.0029, "num_tokens": 26741958.0, "reward": 1.850000023841858, "reward_std": 0.2777460217475891, "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 323.875, "completions/mean_terminated_length": 323.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.5914038000368935, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.04352594749070704, "learning_rate": 1.7759321126295413e-05, "loss": 0.0017, "num_tokens": 26752981.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5915882678472607, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03694290772546083, "learning_rate": 1.7757289599553353e-05, "loss": 0.0015, "num_tokens": 26758395.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 235.375, "completions/mean_terminated_length": 235.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5917727356576278, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.028339531272649765, "learning_rate": 1.7755257268587095e-05, "loss": 0.0011, "num_tokens": 26766158.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5919572034679949, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.05185215000528842, "learning_rate": 1.7753224133607332e-05, "loss": 0.0021, "num_tokens": 26772098.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.5921416712783619, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.06000466225668788, "learning_rate": 1.7751190194824852e-05, "loss": 0.0024, "num_tokens": 26781813.0, "reward": 1.744949460029602, "reward_std": 0.4536868631839752, "rewards/fixed_code_pass_all_test_reward/mean": 0.744949460029602, "rewards/fixed_code_pass_all_test_reward/std": 0.45368683338165283, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 400.25, "completions/mean_terminated_length": 400.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.592326139088729, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.040871202014386654, "learning_rate": 1.774915545245052e-05, "loss": 0.0016, "num_tokens": 26788655.0, "reward": 1.5750000476837158, "reward_std": 0.31052953004837036, "rewards/fixed_code_pass_all_test_reward/mean": 0.574999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5925106068990961, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.05978077370673418, "learning_rate": 1.7747119906695282e-05, "loss": 0.0024, "num_tokens": 26794626.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5926950747094631, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.09818722493946552, "learning_rate": 1.774508355777017e-05, "loss": 0.0039, "num_tokens": 26798676.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5928795425198303, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.10449459427036345, "learning_rate": 1.77430464058863e-05, "loss": 0.0042, "num_tokens": 26805852.0, "reward": 1.7589285373687744, "reward_std": 0.446785569190979, "rewards/fixed_code_pass_all_test_reward/mean": 0.8839285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.32829955220222473, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5930640103301974, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.052320567425340414, "learning_rate": 1.774100845125488e-05, "loss": 0.0021, "num_tokens": 26810291.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5932484781405645, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.059235129272565246, "learning_rate": 1.7738969694087172e-05, "loss": 0.0024, "num_tokens": 26819080.0, "reward": 1.7541667222976685, "reward_std": 0.12321922928094864, "rewards/fixed_code_pass_all_test_reward/mean": 0.7541667222976685, "rewards/fixed_code_pass_all_test_reward/std": 0.12321923673152924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5934329459509315, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.04821228561922908, "learning_rate": 1.7736930134594553e-05, "loss": 0.0019, "num_tokens": 26823315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5936174137612986, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.04046152252703905, "learning_rate": 1.7734889772988473e-05, "loss": 0.0016, "num_tokens": 26836471.0, "reward": 1.6551724672317505, "reward_std": 0.4235307574272156, "rewards/fixed_code_pass_all_test_reward/mean": 0.6551724076271057, "rewards/fixed_code_pass_all_test_reward/std": 0.4235307276248932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 419.875, "completions/mean_terminated_length": 419.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5938018815716657, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.051053815986961126, "learning_rate": 1.7732848609480455e-05, "loss": 0.002, "num_tokens": 26849046.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5939863493820329, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.049739392241463065, "learning_rate": 1.773080664428212e-05, "loss": 0.002, "num_tokens": 26857050.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5941708171924, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.09605036024004221, "learning_rate": 1.7728763877605162e-05, "loss": 0.0038, "num_tokens": 26862366.0, "reward": 1.5, "reward_std": 0.20616242289543152, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.2061624377965927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.594355285002767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.06136267213150859, "learning_rate": 1.7726720309661363e-05, "loss": 0.0025, "num_tokens": 26868814.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5945397528131341, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.07893473096191883, "learning_rate": 1.7724675940662585e-05, "loss": 0.0032, "num_tokens": 26880812.0, "reward": 1.600806474685669, "reward_std": 0.20103605091571808, "rewards/fixed_code_pass_all_test_reward/mean": 0.6008064150810242, "rewards/fixed_code_pass_all_test_reward/std": 0.20103605091571808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 351.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5947242206235012, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.05931929824873805, "learning_rate": 1.7722630770820776e-05, "loss": 0.0024, "num_tokens": 26889122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.5949086884338682, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.07043121685273945, "learning_rate": 1.7720584800347965e-05, "loss": 0.0028, "num_tokens": 26900305.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 334.125, "completions/mean_terminated_length": 334.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5950931562442354, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.08597835060209036, "learning_rate": 1.7718538029456266e-05, "loss": 0.0034, "num_tokens": 26909730.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 506.875, "completions/mean_terminated_length": 506.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5952776240546025, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.049699308816343546, "learning_rate": 1.7716490458357868e-05, "loss": 0.002, "num_tokens": 26922849.0, "reward": 1.9872881174087524, "reward_std": 0.035954609513282776, "rewards/fixed_code_pass_all_test_reward/mean": 0.9872881174087524, "rewards/fixed_code_pass_all_test_reward/std": 0.035954590886831284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5954620918649696, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.0412621502764523, "learning_rate": 1.7714442087265064e-05, "loss": 0.0017, "num_tokens": 26932810.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.5956465596753366, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02958440571092069, "learning_rate": 1.77123929163902e-05, "loss": 0.0012, "num_tokens": 26944819.0, "reward": 1.295454502105713, "reward_std": 0.06428244709968567, "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5958310274857037, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.045489123091101646, "learning_rate": 1.7710342945945725e-05, "loss": 0.0018, "num_tokens": 26951490.0, "reward": 1.4375, "reward_std": 0.2825268805027008, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.2825268805027008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5960154952960708, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.040379425743594766, "learning_rate": 1.7708292176144173e-05, "loss": 0.0016, "num_tokens": 26960525.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 415.5, "completions/mean_terminated_length": 415.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.04919778439216316, "learning_rate": 1.7706240607198143e-05, "loss": 0.002, "num_tokens": 26972201.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.596384430916805, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.09180603828281164, "learning_rate": 1.7704188239320343e-05, "loss": 0.0037, "num_tokens": 26976211.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5965688987271721, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662109375, "kl": 0.020150380791164935, "learning_rate": 1.7702135072723532e-05, "loss": 0.0008, "num_tokens": 26981749.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 427.75, "completions/mean_terminated_length": 427.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5967533665375392, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.053825351991690695, "learning_rate": 1.7700081107620582e-05, "loss": 0.0022, "num_tokens": 26991763.0, "reward": 1.1363636255264282, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 469.625, "completions/mean_terminated_length": 469.625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.5969378343479063, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.04426286509260535, "learning_rate": 1.7698026344224425e-05, "loss": 0.0018, "num_tokens": 27007272.0, "reward": 1.90625, "reward_std": 0.18600594997406006, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.18600596487522125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5971223021582733, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.04161699931137264, "learning_rate": 1.7695970782748092e-05, "loss": 0.0017, "num_tokens": 27020794.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 491.375, "completions/mean_terminated_length": 491.375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.5973067699686405, "frac_reward_zero_std": 0.0, "grad_norm": 0.75390625, "kl": 0.04408438364043832, "learning_rate": 1.7693914423404687e-05, "loss": 0.0018, "num_tokens": 27030693.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5974912377790076, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.058891989290714264, "learning_rate": 1.7691857266407398e-05, "loss": 0.0024, "num_tokens": 27036475.0, "reward": 1.649999976158142, "reward_std": 0.3767024576663971, "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.3767024874687195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.5976757055893747, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.05321342567913234, "learning_rate": 1.76897993119695e-05, "loss": 0.0021, "num_tokens": 27040519.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.5978601733997417, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.13785012532025576, "learning_rate": 1.7687740560304352e-05, "loss": 0.0055, "num_tokens": 27044954.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.5980446412101088, "frac_reward_zero_std": 0.0, "grad_norm": 3.3125, "kl": 0.3188199058640748, "learning_rate": 1.7685681011625382e-05, "loss": 0.0128, "num_tokens": 27049350.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 569.875, "completions/mean_terminated_length": 569.875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.5982291090204759, "frac_reward_zero_std": 1.0, "grad_norm": 0.3671875, "kl": 0.050047642551362514, "learning_rate": 1.7683620666146116e-05, "loss": 0.002, "num_tokens": 27061109.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5984135768308431, "frac_reward_zero_std": 1.0, "grad_norm": 0.46875, "kl": 0.055553977843374014, "learning_rate": 1.768155952408016e-05, "loss": 0.0022, "num_tokens": 27065253.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5985980446412101, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.05457189166918397, "learning_rate": 1.7679497585641193e-05, "loss": 0.0022, "num_tokens": 27069636.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5987825124515772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.05175937106832862, "learning_rate": 1.7677434851042985e-05, "loss": 0.0021, "num_tokens": 27078277.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 100.125, "completions/mean_terminated_length": 100.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5989669802619443, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.07112702820450068, "learning_rate": 1.767537132049939e-05, "loss": 0.0028, "num_tokens": 27081798.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 446.125, "completions/mean_terminated_length": 446.125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.5991514480723114, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.03650404326617718, "learning_rate": 1.7673306994224335e-05, "loss": 0.0015, "num_tokens": 27090743.0, "reward": 1.7678570747375488, "reward_std": 0.3576526939868927, "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.3576527237892151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 304.75, "completions/mean_terminated_length": 304.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5993359158826784, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.045535961631685495, "learning_rate": 1.767124187243184e-05, "loss": 0.0018, "num_tokens": 27097405.0, "reward": 1.2127659320831299, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.21276596188545227, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5995203836930456, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.1333378052804619, "learning_rate": 1.7669175955336008e-05, "loss": 0.0053, "num_tokens": 27102156.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5997048515034127, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.06703867088072002, "learning_rate": 1.7667109243151006e-05, "loss": 0.0027, "num_tokens": 27108330.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5998893193137798, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.028717777808196843, "learning_rate": 1.766504173609111e-05, "loss": 0.0011, "num_tokens": 27113139.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 407.25, "completions/mean_terminated_length": 407.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6000737871241468, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.03851071931421757, "learning_rate": 1.766297343437066e-05, "loss": 0.0015, "num_tokens": 27120821.0, "reward": 1.6764705181121826, "reward_std": 0.05445994809269905, "rewards/fixed_code_pass_all_test_reward/mean": 0.6764706373214722, "rewards/fixed_code_pass_all_test_reward/std": 0.05446000397205353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6002582549345139, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.060590768698602915, "learning_rate": 1.7660904338204077e-05, "loss": 0.0024, "num_tokens": 27127274.0, "reward": 1.8031914234161377, "reward_std": 0.36441856622695923, "rewards/fixed_code_pass_all_test_reward/mean": 0.8031914830207825, "rewards/fixed_code_pass_all_test_reward/std": 0.36441853642463684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 626.875, "completions/mean_terminated_length": 626.875, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.600442722744881, "frac_reward_zero_std": 0.0, "grad_norm": 0.58203125, "kl": 0.024945134646259248, "learning_rate": 1.7658834447805886e-05, "loss": 0.001, "num_tokens": 27143665.0, "reward": 1.9642857313156128, "reward_std": 0.06612997502088547, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6006271905552482, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05722511187195778, "learning_rate": 1.765676376339067e-05, "loss": 0.0023, "num_tokens": 27154068.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6008116583656152, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.03862951137125492, "learning_rate": 1.7654692285173103e-05, "loss": 0.0015, "num_tokens": 27159534.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6009961261759823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.05706696794368327, "learning_rate": 1.7652620013367944e-05, "loss": 0.0023, "num_tokens": 27167953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6011805939863494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.056513707619160414, "learning_rate": 1.7650546948190036e-05, "loss": 0.0023, "num_tokens": 27177382.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6013650617967164, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.03343645087443292, "learning_rate": 1.7648473089854293e-05, "loss": 0.0013, "num_tokens": 27181753.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6015495296070835, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.06353238178417087, "learning_rate": 1.764639843857573e-05, "loss": 0.0025, "num_tokens": 27187848.0, "reward": 1.774999976158142, "reward_std": 0.4200340211391449, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6017339974174507, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.10659371362999082, "learning_rate": 1.7644322994569424e-05, "loss": 0.0043, "num_tokens": 27196611.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6019184652278178, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06496565137058496, "learning_rate": 1.764224675805054e-05, "loss": 0.0026, "num_tokens": 27202859.0, "reward": 1.7088816165924072, "reward_std": 0.19910727441310883, "rewards/fixed_code_pass_all_test_reward/mean": 0.7088816165924072, "rewards/fixed_code_pass_all_test_reward/std": 0.1991073042154312, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 409.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6021029330381849, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.07596419425681233, "learning_rate": 1.764016972923434e-05, "loss": 0.003, "num_tokens": 27213065.0, "reward": 1.8899999856948853, "reward_std": 0.3111270070075989, "rewards/fixed_code_pass_all_test_reward/mean": 0.8899999856948853, "rewards/fixed_code_pass_all_test_reward/std": 0.3111269772052765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 338.25, "completions/mean_terminated_length": 338.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6022874008485519, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.06824757833965123, "learning_rate": 1.763809190833615e-05, "loss": 0.0027, "num_tokens": 27222955.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 420.375, "completions/mean_terminated_length": 420.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.602471868658919, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.06158673297613859, "learning_rate": 1.7636013295571382e-05, "loss": 0.0025, "num_tokens": 27234758.0, "reward": 1.4722222089767456, "reward_std": 0.6417293548583984, "rewards/fixed_code_pass_all_test_reward/mean": 0.5972222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.34085431694984436, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.6026563364692861, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.026337135583162308, "learning_rate": 1.7633933891155538e-05, "loss": 0.0011, "num_tokens": 27242280.0, "reward": 1.4973958730697632, "reward_std": 0.14323261380195618, "rewards/fixed_code_pass_all_test_reward/mean": 0.4973958432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.1432325690984726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 585.75, "completions/mean_terminated_length": 585.75, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.6028408042796533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.03041464532725513, "learning_rate": 1.7631853695304194e-05, "loss": 0.0012, "num_tokens": 27253550.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 296.75, "completions/mean_terminated_length": 296.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6030252720900203, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.06288192025385797, "learning_rate": 1.762977270823301e-05, "loss": 0.0025, "num_tokens": 27262964.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 557.5, "completions/mean_terminated_length": 557.5, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.6032097399003874, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.03864532336592674, "learning_rate": 1.762769093015773e-05, "loss": 0.0015, "num_tokens": 27276032.0, "reward": 0.4166666865348816, "reward_std": 0.7715167999267578, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4629100561141968, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6033942077107545, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04162590950727463, "learning_rate": 1.7625608361294183e-05, "loss": 0.0017, "num_tokens": 27286709.0, "reward": 1.85326087474823, "reward_std": 0.2288147658109665, "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, "rewards/fixed_code_pass_all_test_reward/std": 0.22881478071212769, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6035786755211215, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.04989181808196008, "learning_rate": 1.762352500185827e-05, "loss": 0.002, "num_tokens": 27296772.0, "reward": 1.811274528503418, "reward_std": 0.3732026517391205, "rewards/fixed_code_pass_all_test_reward/mean": 0.811274528503418, "rewards/fixed_code_pass_all_test_reward/std": 0.3732026517391205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 563.125, "completions/mean_terminated_length": 563.125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.6037631433314886, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.03178279101848602, "learning_rate": 1.7621440852065986e-05, "loss": 0.0013, "num_tokens": 27307189.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6039476111418558, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.07737881364300847, "learning_rate": 1.7619355912133395e-05, "loss": 0.0031, "num_tokens": 27314537.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6041320789522229, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04525035014376044, "learning_rate": 1.7617270182276655e-05, "loss": 0.0018, "num_tokens": 27320544.0, "reward": 1.5257353782653809, "reward_std": 0.25948992371559143, "rewards/fixed_code_pass_all_test_reward/mean": 0.5257353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.25948992371559143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.60431654676259, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.06083514983765781, "learning_rate": 1.7615183662711992e-05, "loss": 0.0024, "num_tokens": 27331200.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.604501014572957, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.06710966257378459, "learning_rate": 1.7613096353655735e-05, "loss": 0.0027, "num_tokens": 27337254.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6046854823833241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.03686751937493682, "learning_rate": 1.7611008255324272e-05, "loss": 0.0015, "num_tokens": 27346163.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 405.75, "completions/mean_terminated_length": 405.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6048699501936912, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.06287900824099779, "learning_rate": 1.760891936793409e-05, "loss": 0.0025, "num_tokens": 27356121.0, "reward": 1.6336207389831543, "reward_std": 0.21126903593540192, "rewards/fixed_code_pass_all_test_reward/mean": 0.6336207389831543, "rewards/fixed_code_pass_all_test_reward/std": 0.21126903593540192, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 509.625, "completions/mean_terminated_length": 289.8571472167969, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6050544180040582, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.057863643509335816, "learning_rate": 1.7606829691701746e-05, "loss": 0.0023, "num_tokens": 27368734.0, "reward": 1.329545497894287, "reward_std": 0.6325255036354065, "rewards/fixed_code_pass_all_test_reward/mean": 0.5795454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.3746308982372284, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6052388858144254, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.040455823531374335, "learning_rate": 1.7604739226843884e-05, "loss": 0.0016, "num_tokens": 27376488.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6054233536247925, "frac_reward_zero_std": 1.0, "grad_norm": 0.51171875, "kl": 0.0893332022242248, "learning_rate": 1.7602647973577235e-05, "loss": 0.0036, "num_tokens": 27384842.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6056078214351596, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0565935664344579, "learning_rate": 1.7600555932118602e-05, "loss": 0.0023, "num_tokens": 27394950.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6057922892455266, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.05287824268452823, "learning_rate": 1.7598463102684872e-05, "loss": 0.0021, "num_tokens": 27403081.0, "reward": 1.4675325155258179, "reward_std": 0.33320876955986023, "rewards/fixed_code_pass_all_test_reward/mean": 0.4675324559211731, "rewards/fixed_code_pass_all_test_reward/std": 0.3332087993621826, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 401.5, "completions/mean_terminated_length": 401.5, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.6059767570558937, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.02626150520518422, "learning_rate": 1.7596369485493022e-05, "loss": 0.0011, "num_tokens": 27410733.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6061612248662608, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.09091789182275534, "learning_rate": 1.75942750807601e-05, "loss": 0.0036, "num_tokens": 27419222.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.606345692676628, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.07641315320506692, "learning_rate": 1.7592179888703234e-05, "loss": 0.0031, "num_tokens": 27432169.0, "reward": 1.5969388484954834, "reward_std": 0.36192142963409424, "rewards/fixed_code_pass_all_test_reward/mean": 0.7219387888908386, "rewards/fixed_code_pass_all_test_reward/std": 0.18153518438339233, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.606530160486995, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.04969023144803941, "learning_rate": 1.7590083909539655e-05, "loss": 0.002, "num_tokens": 27441831.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6067146282973621, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.04731839359737933, "learning_rate": 1.7587987143486645e-05, "loss": 0.0019, "num_tokens": 27448428.0, "reward": 1.795212745666504, "reward_std": 0.08274652063846588, "rewards/fixed_code_pass_all_test_reward/mean": 0.7952128052711487, "rewards/fixed_code_pass_all_test_reward/std": 0.08274652808904648, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6068990961077292, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.02953702607192099, "learning_rate": 1.758588959076159e-05, "loss": 0.0012, "num_tokens": 27457955.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6070835639180963, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.02751934784464538, "learning_rate": 1.7583791251581953e-05, "loss": 0.0011, "num_tokens": 27466432.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 325.125, "completions/mean_terminated_length": 325.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6072680317284633, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.0611198334954679, "learning_rate": 1.758169212616527e-05, "loss": 0.0024, "num_tokens": 27476281.0, "reward": 1.2553191184997559, "reward_std": 0.09914592653512955, "rewards/fixed_code_pass_all_test_reward/mean": 0.25531914830207825, "rewards/fixed_code_pass_all_test_reward/std": 0.09914593398571014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6074524995388305, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.05263520427979529, "learning_rate": 1.7579592214729166e-05, "loss": 0.0021, "num_tokens": 27485024.0, "reward": 1.9393939971923828, "reward_std": 0.17141982913017273, "rewards/fixed_code_pass_all_test_reward/mean": 0.939393937587738, "rewards/fixed_code_pass_all_test_reward/std": 0.17141982913017273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6076369673491976, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.025855178595520556, "learning_rate": 1.7577491517491345e-05, "loss": 0.001, "num_tokens": 27491426.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6078214351595647, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.06575867533683777, "learning_rate": 1.7575390034669594e-05, "loss": 0.0026, "num_tokens": 27496008.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 251.25, "completions/mean_terminated_length": 251.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6080059029699317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.05254435818642378, "learning_rate": 1.7573287766481785e-05, "loss": 0.0021, "num_tokens": 27504210.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6081903707802988, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.04450291907414794, "learning_rate": 1.7571184713145863e-05, "loss": 0.0018, "num_tokens": 27512144.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.6083748385906659, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04374199407175183, "learning_rate": 1.7569080874879856e-05, "loss": 0.0017, "num_tokens": 27519968.0, "reward": 1.2426470518112183, "reward_std": 0.23621143400669098, "rewards/fixed_code_pass_all_test_reward/mean": 0.36764705181121826, "rewards/fixed_code_pass_all_test_reward/std": 0.24707883596420288, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6085593064010331, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.03455950319766998, "learning_rate": 1.756697625190188e-05, "loss": 0.0014, "num_tokens": 27526633.0, "reward": 1.83695650100708, "reward_std": 0.10063259303569794, "rewards/fixed_code_pass_all_test_reward/mean": 0.8369565010070801, "rewards/fixed_code_pass_all_test_reward/std": 0.10063262283802032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.6087437742114001, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.029937791405245662, "learning_rate": 1.756487084443013e-05, "loss": 0.0012, "num_tokens": 27530909.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6089282420217672, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.0454005200881511, "learning_rate": 1.7562764652682876e-05, "loss": 0.0018, "num_tokens": 27538278.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6091127098321343, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0438581247581169, "learning_rate": 1.7560657676878477e-05, "loss": 0.0018, "num_tokens": 27547943.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6092971776425014, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.03327989799436182, "learning_rate": 1.755854991723537e-05, "loss": 0.0013, "num_tokens": 27552782.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.6094816454528684, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.04942752467468381, "learning_rate": 1.7556441373972072e-05, "loss": 0.002, "num_tokens": 27564175.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 435.125, "completions/mean_terminated_length": 435.125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.6096661132632356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.024697610642760992, "learning_rate": 1.7554332047307183e-05, "loss": 0.001, "num_tokens": 27573496.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6098505810736027, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04457639798056334, "learning_rate": 1.7552221937459385e-05, "loss": 0.0018, "num_tokens": 27578282.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.6100350488839698, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.07007634686306119, "learning_rate": 1.755011104464744e-05, "loss": 0.0028, "num_tokens": 27586295.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 493.625, "completions/mean_terminated_length": 271.5714416503906, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.6102195166943368, "frac_reward_zero_std": 0.0, "grad_norm": 0.4453125, "kl": 0.017332305782474577, "learning_rate": 1.754799936909019e-05, "loss": 0.0007, "num_tokens": 27593684.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.6104039845047039, "frac_reward_zero_std": 1.0, "grad_norm": 0.049072265625, "kl": 0.029942544177174568, "learning_rate": 1.7545886911006563e-05, "loss": 0.0012, "num_tokens": 27603265.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.610588452315071, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.07714909967035055, "learning_rate": 1.7543773670615557e-05, "loss": 0.0031, "num_tokens": 27610147.0, "reward": 1.3256173133850098, "reward_std": 0.2386571317911148, "rewards/fixed_code_pass_all_test_reward/mean": 0.3256172835826874, "rewards/fixed_code_pass_all_test_reward/std": 0.2386571615934372, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6107729201254382, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.10459733661264181, "learning_rate": 1.754165964813627e-05, "loss": 0.0042, "num_tokens": 27617367.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6109573879358052, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.035625193966552615, "learning_rate": 1.753954484378786e-05, "loss": 0.0014, "num_tokens": 27627240.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 185.375, "completions/mean_terminated_length": 185.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6111418557461723, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150390625, "kl": 0.02185181574895978, "learning_rate": 1.753742925778958e-05, "loss": 0.0009, "num_tokens": 27631795.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6113263235565394, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.06494635296985507, "learning_rate": 1.7535312890360757e-05, "loss": 0.0026, "num_tokens": 27640519.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 497.25, "completions/mean_terminated_length": 497.25, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.6115107913669064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04333417466841638, "learning_rate": 1.753319574172081e-05, "loss": 0.0017, "num_tokens": 27650369.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6116952591772735, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.04373086185660213, "learning_rate": 1.7531077812089222e-05, "loss": 0.0017, "num_tokens": 27659453.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6118797269876407, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.02060942817479372, "learning_rate": 1.752895910168557e-05, "loss": 0.0008, "num_tokens": 27665242.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 218.5, "completions/mean_terminated_length": 218.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6120641947980078, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.0870751915499568, "learning_rate": 1.75268396107295e-05, "loss": 0.0035, "num_tokens": 27673462.0, "reward": 1.0555555820465088, "reward_std": 0.05939141660928726, "rewards/fixed_code_pass_all_test_reward/mean": 0.0555555559694767, "rewards/fixed_code_pass_all_test_reward/std": 0.059391386806964874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6122486626083748, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "kl": 0.08483859500847757, "learning_rate": 1.752471933944076e-05, "loss": 0.0034, "num_tokens": 27677173.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6124331304187419, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.061213011387735605, "learning_rate": 1.752259828803916e-05, "loss": 0.0024, "num_tokens": 27681542.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 482.75, "completions/mean_terminated_length": 482.75, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.612617598229109, "frac_reward_zero_std": 0.0, "grad_norm": 0.5625, "kl": 0.02059119159821421, "learning_rate": 1.752047645674459e-05, "loss": 0.0008, "num_tokens": 27694452.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6128020660394761, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.040000207256525755, "learning_rate": 1.7518353845777038e-05, "loss": 0.0016, "num_tokens": 27698613.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6129865338498433, "frac_reward_zero_std": 1.0, "grad_norm": 0.054443359375, "kl": 0.05184913636185229, "learning_rate": 1.7516230455356556e-05, "loss": 0.0021, "num_tokens": 27707823.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 500.625, "completions/mean_terminated_length": 279.5714416503906, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6131710016602103, "frac_reward_zero_std": 0.0, "grad_norm": 0.64453125, "kl": 0.04622071864287136, "learning_rate": 1.7514106285703283e-05, "loss": 0.0018, "num_tokens": 27719348.0, "reward": 1.3660714626312256, "reward_std": 0.5917157530784607, "rewards/fixed_code_pass_all_test_reward/mean": 0.4910714328289032, "rewards/fixed_code_pass_all_test_reward/std": 0.29124119877815247, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6133554694705774, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.07622893061488867, "learning_rate": 1.751198133703744e-05, "loss": 0.003, "num_tokens": 27727626.0, "reward": 1.798076868057251, "reward_std": 0.3761429488658905, "rewards/fixed_code_pass_all_test_reward/mean": 0.7980769276618958, "rewards/fixed_code_pass_all_test_reward/std": 0.3761429488658905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6135399372809445, "frac_reward_zero_std": 1.0, "grad_norm": 0.26171875, "kl": 0.07414495339617133, "learning_rate": 1.7509855609579332e-05, "loss": 0.003, "num_tokens": 27731937.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 232.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6137244050913115, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.02651358162984252, "learning_rate": 1.7507729103549328e-05, "loss": 0.0011, "num_tokens": 27737899.0, "reward": 1.9711538553237915, "reward_std": 0.0815892145037651, "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.0815892368555069, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.6139088729016786, "frac_reward_zero_std": 1.0, "grad_norm": 0.36328125, "kl": 0.09215324791148305, "learning_rate": 1.7505601819167904e-05, "loss": 0.0037, "num_tokens": 27745019.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6140933407120458, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04909712774679065, "learning_rate": 1.7503473756655594e-05, "loss": 0.002, "num_tokens": 27753138.0, "reward": 1.9764705896377563, "reward_std": 0.06655122339725494, "rewards/fixed_code_pass_all_test_reward/mean": 0.9764705896377563, "rewards/fixed_code_pass_all_test_reward/std": 0.06655122339725494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.6142778085224129, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.031882311566732824, "learning_rate": 1.7501344916233027e-05, "loss": 0.0013, "num_tokens": 27761893.0, "reward": 1.8989362716674805, "reward_std": 0.12854301929473877, "rewards/fixed_code_pass_all_test_reward/mean": 0.8989361524581909, "rewards/fixed_code_pass_all_test_reward/std": 0.12854304909706116, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6144622763327799, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.06847328878939152, "learning_rate": 1.74992152981209e-05, "loss": 0.0027, "num_tokens": 27769917.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 603.0, "completions/mean_terminated_length": 603.0, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.05860861111432314, "learning_rate": 1.7497084902540004e-05, "loss": 0.0023, "num_tokens": 27785789.0, "reward": 1.1607142686843872, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.6148312119535141, "frac_reward_zero_std": 1.0, "grad_norm": 0.03857421875, "kl": 0.027779660536907613, "learning_rate": 1.74949537297112e-05, "loss": 0.0011, "num_tokens": 27800011.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6150156797638812, "frac_reward_zero_std": 1.0, "grad_norm": 0.7421875, "kl": 0.09088262682780623, "learning_rate": 1.7492821779855432e-05, "loss": 0.0036, "num_tokens": 27805797.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6152001475742483, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.03620972763746977, "learning_rate": 1.7490689053193735e-05, "loss": 0.0014, "num_tokens": 27813038.0, "reward": 1.584302306175232, "reward_std": 0.13977688550949097, "rewards/fixed_code_pass_all_test_reward/mean": 0.5843023061752319, "rewards/fixed_code_pass_all_test_reward/std": 0.13977693021297455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.6153846153846154, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.028599119978025556, "learning_rate": 1.7488555549947214e-05, "loss": 0.0011, "num_tokens": 27817353.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6155690831949825, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.05023747938685119, "learning_rate": 1.7486421270337047e-05, "loss": 0.002, "num_tokens": 27823536.0, "reward": 1.8125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6157535510053496, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.055596537655219436, "learning_rate": 1.748428621458451e-05, "loss": 0.0022, "num_tokens": 27832094.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6159380188157166, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03635450708679855, "learning_rate": 1.748215038291095e-05, "loss": 0.0015, "num_tokens": 27841650.0, "reward": 1.875, "reward_std": 0.05050760135054588, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6161224866260837, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.07267401181161404, "learning_rate": 1.7480013775537797e-05, "loss": 0.0029, "num_tokens": 27846993.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 101.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.6163069544364509, "frac_reward_zero_std": 0.0, "grad_norm": 4.03125, "kl": 0.06279017543420196, "learning_rate": 1.7477876392686557e-05, "loss": 0.0025, "num_tokens": 27850513.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 863.5, "completions/mean_terminated_length": 863.5, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 0.616491422246818, "frac_reward_zero_std": 0.0, "grad_norm": 0.57421875, "kl": 0.031212733825668693, "learning_rate": 1.7475738234578822e-05, "loss": 0.0012, "num_tokens": 27869229.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.616675890057185, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.04730022861622274, "learning_rate": 1.747359930143626e-05, "loss": 0.0019, "num_tokens": 27878248.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6168603578675521, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03963503846898675, "learning_rate": 1.7471459593480626e-05, "loss": 0.0016, "num_tokens": 27884544.0, "reward": 1.9617347717285156, "reward_std": 0.1082305982708931, "rewards/fixed_code_pass_all_test_reward/mean": 0.9617347121238708, "rewards/fixed_code_pass_all_test_reward/std": 0.10823062062263489, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 481.125, "completions/mean_terminated_length": 481.125, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.6170448256779192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.04432960296981037, "learning_rate": 1.746931911093374e-05, "loss": 0.0018, "num_tokens": 27893785.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6172292934882863, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.049275541212409735, "learning_rate": 1.7467177854017528e-05, "loss": 0.002, "num_tokens": 27903111.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6174137612986533, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.06680664815939963, "learning_rate": 1.746503582295397e-05, "loss": 0.0027, "num_tokens": 27915590.0, "reward": 1.7976189851760864, "reward_std": 0.235702246427536, "rewards/fixed_code_pass_all_test_reward/mean": 0.797619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.235702246427536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6175982291090205, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.03738026041537523, "learning_rate": 1.7462893017965145e-05, "loss": 0.0015, "num_tokens": 27920581.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.6177826969193876, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.14535973500460386, "learning_rate": 1.74607494392732e-05, "loss": 0.0058, "num_tokens": 27924783.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6179671647297547, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.06586991040967405, "learning_rate": 1.7458605087100364e-05, "loss": 0.0026, "num_tokens": 27932991.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6181516325401217, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.11210122145712376, "learning_rate": 1.7456459961668956e-05, "loss": 0.0045, "num_tokens": 27936984.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6183361003504888, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.06594277825206518, "learning_rate": 1.745431406320136e-05, "loss": 0.0026, "num_tokens": 27948458.0, "reward": 1.8509615659713745, "reward_std": 0.3467257022857666, "rewards/fixed_code_pass_all_test_reward/mean": 0.9759615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.04568212106823921, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6185205681608559, "frac_reward_zero_std": 1.0, "grad_norm": 0.055908203125, "kl": 0.043571528512984514, "learning_rate": 1.7452167391920063e-05, "loss": 0.0017, "num_tokens": 27953710.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6187050359712231, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.03385118069127202, "learning_rate": 1.7450019948047606e-05, "loss": 0.0014, "num_tokens": 27964442.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6188895037815901, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.0467336589936167, "learning_rate": 1.744787173180662e-05, "loss": 0.0019, "num_tokens": 27968295.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6190739715919572, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05532377725467086, "learning_rate": 1.7445722743419826e-05, "loss": 0.0022, "num_tokens": 27975180.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6192584394023243, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.054185253102332354, "learning_rate": 1.7443572983110014e-05, "loss": 0.0022, "num_tokens": 27981386.0, "reward": 1.920454502105713, "reward_std": 0.22498849034309387, "rewards/fixed_code_pass_all_test_reward/mean": 0.9204545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6194429072126914, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.05819185171276331, "learning_rate": 1.744142245110005e-05, "loss": 0.0023, "num_tokens": 27991565.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6196273750230584, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.05045355367474258, "learning_rate": 1.7439271147612897e-05, "loss": 0.002, "num_tokens": 28000981.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 346.375, "completions/mean_terminated_length": 346.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6198118428334256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.05596318026073277, "learning_rate": 1.743711907287158e-05, "loss": 0.0022, "num_tokens": 28011064.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6199963106437927, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.06303555611521006, "learning_rate": 1.743496622709922e-05, "loss": 0.0025, "num_tokens": 28017252.0, "reward": 1.8815789222717285, "reward_std": 0.33494532108306885, "rewards/fixed_code_pass_all_test_reward/mean": 0.8815789222717285, "rewards/fixed_code_pass_all_test_reward/std": 0.33494532108306885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6201807784541598, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.05829335446469486, "learning_rate": 1.7432812610519003e-05, "loss": 0.0023, "num_tokens": 28023619.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6203652462645268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.04785671178251505, "learning_rate": 1.7430658223354203e-05, "loss": 0.0019, "num_tokens": 28031972.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6205497140748939, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0487944211345166, "learning_rate": 1.7428503065828174e-05, "loss": 0.002, "num_tokens": 28040976.0, "reward": 1.9246032238006592, "reward_std": 0.2132544219493866, "rewards/fixed_code_pass_all_test_reward/mean": 0.9246031641960144, "rewards/fixed_code_pass_all_test_reward/std": 0.2132544368505478, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.620734181885261, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.058223762549459934, "learning_rate": 1.7426347138164346e-05, "loss": 0.0023, "num_tokens": 28047712.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6209186496956282, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.028028541710227728, "learning_rate": 1.742419044058623e-05, "loss": 0.0011, "num_tokens": 28053271.0, "reward": 1.975000023841858, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 964.625, "completions/mean_terminated_length": 964.625, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 0.6211031175059952, "frac_reward_zero_std": 0.0, "grad_norm": 0.388671875, "kl": 0.02563648554496467, "learning_rate": 1.742203297331743e-05, "loss": 0.001, "num_tokens": 28073100.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6212875853163623, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03543960826937109, "learning_rate": 1.74198747365816e-05, "loss": 0.0014, "num_tokens": 28081649.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6214720531267294, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0640694797039032, "learning_rate": 1.7417715730602504e-05, "loss": 0.0026, "num_tokens": 28092163.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6216565209370964, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.0542169027030468, "learning_rate": 1.741555595560397e-05, "loss": 0.0022, "num_tokens": 28101403.0, "reward": 1.3125, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6218409887474635, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.052955642342567444, "learning_rate": 1.7413395411809907e-05, "loss": 0.0021, "num_tokens": 28107425.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6220254565578307, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.048725270200520754, "learning_rate": 1.741123409944431e-05, "loss": 0.0019, "num_tokens": 28114953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6222099243681978, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.061364800203591585, "learning_rate": 1.7409072018731245e-05, "loss": 0.0025, "num_tokens": 28125601.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 115.375, "completions/mean_terminated_length": 115.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6223943921785648, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.06109807686880231, "learning_rate": 1.7406909169894866e-05, "loss": 0.0024, "num_tokens": 28129180.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6225788599889319, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.045600482961162925, "learning_rate": 1.74047455531594e-05, "loss": 0.0018, "num_tokens": 28137506.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.622763327799299, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.05583689140621573, "learning_rate": 1.7402581168749156e-05, "loss": 0.0022, "num_tokens": 28149091.0, "reward": 1.6749999523162842, "reward_std": 0.46521881222724915, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.46521884202957153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 228.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.6229477956096661, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.1007259413599968, "learning_rate": 1.7400416016888527e-05, "loss": 0.004, "num_tokens": 28156849.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6231322634200332, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.06840837420895696, "learning_rate": 1.7398250097801977e-05, "loss": 0.0027, "num_tokens": 28165013.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6233167312304003, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.05639171740040183, "learning_rate": 1.7396083411714057e-05, "loss": 0.0023, "num_tokens": 28172927.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6235011990407674, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.03930812678299844, "learning_rate": 1.739391595884939e-05, "loss": 0.0016, "num_tokens": 28184119.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6236856668511345, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.02575731312390417, "learning_rate": 1.7391747739432695e-05, "loss": 0.001, "num_tokens": 28192235.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6238701346615015, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.060396873159334064, "learning_rate": 1.7389578753688744e-05, "loss": 0.0024, "num_tokens": 28199758.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6240546024718686, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.06448884634301066, "learning_rate": 1.738740900184241e-05, "loss": 0.0026, "num_tokens": 28204224.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6242390702822358, "frac_reward_zero_std": 1.0, "grad_norm": 0.82421875, "kl": 0.1459603374823928, "learning_rate": 1.738523848411864e-05, "loss": 0.0058, "num_tokens": 28213373.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 338.625, "completions/mean_terminated_length": 338.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6244235380926029, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.05459106923080981, "learning_rate": 1.7383067200742454e-05, "loss": 0.0022, "num_tokens": 28222818.0, "reward": 1.6875, "reward_std": 0.45806270837783813, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6246080059029699, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.04633820476010442, "learning_rate": 1.7380895151938962e-05, "loss": 0.0019, "num_tokens": 28228475.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.624792473713337, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.05067318584769964, "learning_rate": 1.7378722337933342e-05, "loss": 0.002, "num_tokens": 28237151.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 418.0, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6249769415237041, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.056313605047762394, "learning_rate": 1.737654875895086e-05, "loss": 0.0023, "num_tokens": 28249191.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 440.125, "completions/mean_terminated_length": 210.42857360839844, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6251614093340712, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05960172868799418, "learning_rate": 1.737437441521686e-05, "loss": 0.0024, "num_tokens": 28255888.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 289.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6253458771444383, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.060770762618631124, "learning_rate": 1.7372199306956758e-05, "loss": 0.0024, "num_tokens": 28264651.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6255303449548054, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.055230563739314675, "learning_rate": 1.7370023434396057e-05, "loss": 0.0022, "num_tokens": 28273953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6257148127651725, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05630552442744374, "learning_rate": 1.736784679776034e-05, "loss": 0.0023, "num_tokens": 28283985.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6258992805755396, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.06325561599805951, "learning_rate": 1.7365669397275265e-05, "loss": 0.0025, "num_tokens": 28292469.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6260837483859066, "frac_reward_zero_std": 1.0, "grad_norm": 0.1611328125, "kl": 0.04284806200303137, "learning_rate": 1.7363491233166567e-05, "loss": 0.0017, "num_tokens": 28296634.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6262682161962737, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.06358213676139712, "learning_rate": 1.7361312305660064e-05, "loss": 0.0025, "num_tokens": 28306146.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 598.75, "completions/mean_terminated_length": 598.75, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.6264526840066409, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.03874512610491365, "learning_rate": 1.735913261498166e-05, "loss": 0.0015, "num_tokens": 28316856.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 232.875, "completions/mean_terminated_length": 232.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.626637151817008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.04345501773059368, "learning_rate": 1.7356952161357322e-05, "loss": 0.0017, "num_tokens": 28326335.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.626821619627375, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.05715658771805465, "learning_rate": 1.7354770945013107e-05, "loss": 0.0023, "num_tokens": 28333415.0, "reward": 1.8571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 917.875, "completions/mean_terminated_length": 917.875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.6270060874377421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0286865234375, "kl": 0.01515455375192687, "learning_rate": 1.7352588966175156e-05, "loss": 0.0006, "num_tokens": 28349566.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6271905552481092, "frac_reward_zero_std": 1.0, "grad_norm": 0.2021484375, "kl": 0.07053592288866639, "learning_rate": 1.7350406225069674e-05, "loss": 0.0028, "num_tokens": 28360800.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 334.125, "completions/mean_terminated_length": 334.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6273750230584763, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06689757923595607, "learning_rate": 1.7348222721922953e-05, "loss": 0.0027, "num_tokens": 28370057.0, "reward": 1.5, "reward_std": 0.7559289336204529, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6275594908688434, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.05433904880192131, "learning_rate": 1.734603845696137e-05, "loss": 0.0022, "num_tokens": 28373737.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6277439586792105, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.02973150776233524, "learning_rate": 1.734385343041137e-05, "loss": 0.0012, "num_tokens": 28383836.0, "reward": 1.9488189220428467, "reward_std": 0.061245452612638474, "rewards/fixed_code_pass_all_test_reward/mean": 0.9488189220428467, "rewards/fixed_code_pass_all_test_reward/std": 0.061245471239089966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 638.75, "completions/mean_terminated_length": 638.75, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.6279284264895776, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.027418581070378423, "learning_rate": 1.7341667642499485e-05, "loss": 0.0011, "num_tokens": 28398314.0, "reward": 1.192307710647583, "reward_std": 0.15924587845802307, "rewards/fixed_code_pass_all_test_reward/mean": 0.192307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.15924592316150665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6281128942999447, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.059361332561820745, "learning_rate": 1.7339481093452325e-05, "loss": 0.0024, "num_tokens": 28404325.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6282973621103117, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.05191280925646424, "learning_rate": 1.7337293783496567e-05, "loss": 0.0021, "num_tokens": 28409844.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6284818299206788, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.05258111469447613, "learning_rate": 1.7335105712858988e-05, "loss": 0.0021, "num_tokens": 28417617.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.628666297731046, "frac_reward_zero_std": 1.0, "grad_norm": 0.26953125, "kl": 0.07505911658518016, "learning_rate": 1.7332916881766423e-05, "loss": 0.003, "num_tokens": 28423889.0, "reward": 1.1538461446762085, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461595773697, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 388.625, "completions/mean_terminated_length": 388.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6288507655414131, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.064131042920053, "learning_rate": 1.7330727290445806e-05, "loss": 0.0026, "num_tokens": 28431214.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6290352333517801, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.05376318749040365, "learning_rate": 1.732853693912413e-05, "loss": 0.0022, "num_tokens": 28441424.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6292197011621472, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.0251788446912542, "learning_rate": 1.7326345828028483e-05, "loss": 0.001, "num_tokens": 28451165.0, "reward": 1.5921052694320679, "reward_std": 0.43605780601501465, "rewards/fixed_code_pass_all_test_reward/mean": 0.5921052694320679, "rewards/fixed_code_pass_all_test_reward/std": 0.43605780601501465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6294041689725143, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.037185788969509304, "learning_rate": 1.732415395738602e-05, "loss": 0.0015, "num_tokens": 28456582.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6295886367828814, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04170013265684247, "learning_rate": 1.732196132742398e-05, "loss": 0.0017, "num_tokens": 28463765.0, "reward": 1.696428656578064, "reward_std": 0.1937432587146759, "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.1937432438135147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6297731045932484, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.0426305690780282, "learning_rate": 1.7319767938369682e-05, "loss": 0.0017, "num_tokens": 28471790.0, "reward": 1.75, "reward_std": 0.1478712558746338, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 488.5, "completions/mean_terminated_length": 488.5, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.6299575724036156, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.03625028778333217, "learning_rate": 1.731757379045052e-05, "loss": 0.0015, "num_tokens": 28481322.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6301420402139827, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06179301766678691, "learning_rate": 1.731537888389397e-05, "loss": 0.0025, "num_tokens": 28491613.0, "reward": 1.451923131942749, "reward_std": 0.8971284627914429, "rewards/fixed_code_pass_all_test_reward/mean": 0.7019230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.4352640211582184, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 431.75, "completions/mean_terminated_length": 431.75, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6303265080243498, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.03763705282472074, "learning_rate": 1.731318321892759e-05, "loss": 0.0015, "num_tokens": 28499507.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 365.5, "completions/mean_terminated_length": 365.5, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.6305109758347168, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.04317702213302255, "learning_rate": 1.7310986795779006e-05, "loss": 0.0017, "num_tokens": 28507519.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6306954436450839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.06232546176761389, "learning_rate": 1.7308789614675926e-05, "loss": 0.0025, "num_tokens": 28518518.0, "reward": 1.1818182468414307, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.630879911455451, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.04734708252362907, "learning_rate": 1.7306591675846145e-05, "loss": 0.0019, "num_tokens": 28529506.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6310643792658182, "frac_reward_zero_std": 1.0, "grad_norm": 0.1904296875, "kl": 0.07584367087110877, "learning_rate": 1.730439297951753e-05, "loss": 0.003, "num_tokens": 28533718.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6312488470761852, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.047650488559156656, "learning_rate": 1.7302193525918026e-05, "loss": 0.0019, "num_tokens": 28540089.0, "reward": 1.732758641242981, "reward_std": 0.09353125840425491, "rewards/fixed_code_pass_all_test_reward/mean": 0.732758641242981, "rewards/fixed_code_pass_all_test_reward/std": 0.09353122860193253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6314333148865523, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.04081716435030103, "learning_rate": 1.7299993315275656e-05, "loss": 0.0016, "num_tokens": 28544560.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 324.75, "completions/mean_terminated_length": 324.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6316177826969194, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.03762569115497172, "learning_rate": 1.7297792347818525e-05, "loss": 0.0015, "num_tokens": 28551054.0, "reward": 1.5, "reward_std": 0.3831780254840851, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3831780254840851, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6318022505072864, "frac_reward_zero_std": 0.0, "grad_norm": 17.5, "kl": 0.13949679117649794, "learning_rate": 1.7295590623774815e-05, "loss": 0.0056, "num_tokens": 28560267.0, "reward": 1.796875, "reward_std": 0.3892385959625244, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 394.0, "completions/mean_terminated_length": 394.0, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.6319867183176535, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.033099250635132194, "learning_rate": 1.7293388143372788e-05, "loss": 0.0013, "num_tokens": 28570507.0, "reward": 1.9886363744735718, "reward_std": 0.03214118629693985, "rewards/fixed_code_pass_all_test_reward/mean": 0.9886363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6321711861280207, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.07750570075586438, "learning_rate": 1.7291184906840776e-05, "loss": 0.0031, "num_tokens": 28574895.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 335.25, "completions/mean_terminated_length": 335.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6323556539383878, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.07906265323981643, "learning_rate": 1.7288980914407203e-05, "loss": 0.0032, "num_tokens": 28583545.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6325401217487548, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04161822376772761, "learning_rate": 1.728677616630056e-05, "loss": 0.0017, "num_tokens": 28593432.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6327245895591219, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.0615637784358114, "learning_rate": 1.7284570662749422e-05, "loss": 0.0025, "num_tokens": 28598650.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.632909057369489, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.056954541709274054, "learning_rate": 1.728236440398244e-05, "loss": 0.0023, "num_tokens": 28607219.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6330935251798561, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.07060477905906737, "learning_rate": 1.728015739022835e-05, "loss": 0.0028, "num_tokens": 28613516.0, "reward": 1.9821429252624512, "reward_std": 0.033064987510442734, "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.03306501731276512, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6332779929902232, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.057190278079360723, "learning_rate": 1.7277949621715953e-05, "loss": 0.0023, "num_tokens": 28617827.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.6334624608005903, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03602286125533283, "learning_rate": 1.7275741098674138e-05, "loss": 0.0014, "num_tokens": 28628608.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6336469286109574, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.06067067617550492, "learning_rate": 1.727353182133187e-05, "loss": 0.0024, "num_tokens": 28635000.0, "reward": 1.5843373537063599, "reward_std": 0.44436201453208923, "rewards/fixed_code_pass_all_test_reward/mean": 0.5843373537063599, "rewards/fixed_code_pass_all_test_reward/std": 0.444362074136734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6338313964213245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.056075247935950756, "learning_rate": 1.727132178991819e-05, "loss": 0.0022, "num_tokens": 28644289.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.6340158642316915, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.06642380123957992, "learning_rate": 1.7269111004662226e-05, "loss": 0.0027, "num_tokens": 28655998.0, "reward": 1.3854166269302368, "reward_std": 0.2517301142215729, "rewards/fixed_code_pass_all_test_reward/mean": 0.3854166567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.2517301142215729, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6342003320420586, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.07010610355064273, "learning_rate": 1.7266899465793168e-05, "loss": 0.0028, "num_tokens": 28665933.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6343847998524258, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03258367581292987, "learning_rate": 1.7264687173540305e-05, "loss": 0.0013, "num_tokens": 28678188.0, "reward": 1.4789916276931763, "reward_std": 0.0950731709599495, "rewards/fixed_code_pass_all_test_reward/mean": 0.47899162769317627, "rewards/fixed_code_pass_all_test_reward/std": 0.09507319331169128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6345692676627929, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07312341034412384, "learning_rate": 1.7262474128132983e-05, "loss": 0.0029, "num_tokens": 28685942.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 421.625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.6347537354731599, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.04071953718084842, "learning_rate": 1.7260260329800642e-05, "loss": 0.0016, "num_tokens": 28694187.0, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.634938203283527, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.03248983109369874, "learning_rate": 1.725804577877279e-05, "loss": 0.0013, "num_tokens": 28699045.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.6351226710938941, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.06660315161570907, "learning_rate": 1.725583047527902e-05, "loss": 0.0027, "num_tokens": 28703411.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 499.625, "completions/mean_terminated_length": 499.625, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.6353071389042612, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.019686337211169302, "learning_rate": 1.7253614419548997e-05, "loss": 0.0008, "num_tokens": 28713024.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 355.0, "completions/mean_terminated_length": 355.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6354916067146283, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.023041862528771162, "learning_rate": 1.725139761181247e-05, "loss": 0.0009, "num_tokens": 28723440.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.6356760745249954, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.0500498884357512, "learning_rate": 1.7249180052299263e-05, "loss": 0.002, "num_tokens": 28731531.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6358605423353625, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.09902601689100266, "learning_rate": 1.7246961741239273e-05, "loss": 0.004, "num_tokens": 28735701.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 417.0, "completions/mean_terminated_length": 417.0, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.6360450101457296, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.034463790827430785, "learning_rate": 1.7244742678862483e-05, "loss": 0.0014, "num_tokens": 28744029.0, "reward": 1.0110294818878174, "reward_std": 0.03119589202105999, "rewards/fixed_code_pass_all_test_reward/mean": 0.011029412038624287, "rewards/fixed_code_pass_all_test_reward/std": 0.03119588829576969, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6362294779560966, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.059244331205263734, "learning_rate": 1.724252286539895e-05, "loss": 0.0024, "num_tokens": 28749760.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 474.375, "completions/mean_terminated_length": 474.375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.6364139457664637, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.06603417498990893, "learning_rate": 1.724030230107881e-05, "loss": 0.0026, "num_tokens": 28758699.0, "reward": 1.461538553237915, "reward_std": 0.39007121324539185, "rewards/fixed_code_pass_all_test_reward/mean": 0.4615384638309479, "rewards/fixed_code_pass_all_test_reward/std": 0.39007121324539185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 228.625, "completions/mean_terminated_length": 228.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6365984135768309, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.040396714583039284, "learning_rate": 1.723808098613228e-05, "loss": 0.0016, "num_tokens": 28766768.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 520.5, "completions/mean_terminated_length": 520.5, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.636782881387198, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.025033711805008352, "learning_rate": 1.723585892078964e-05, "loss": 0.001, "num_tokens": 28776652.0, "reward": 1.2916667461395264, "reward_std": 0.33034372329711914, "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.33034375309944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.636967349197565, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.09221207536756992, "learning_rate": 1.723363610528127e-05, "loss": 0.0037, "num_tokens": 28781187.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 313.875, "completions/mean_terminated_length": 313.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6371518170079321, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.04959116643294692, "learning_rate": 1.7231412539837616e-05, "loss": 0.002, "num_tokens": 28790194.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6373362848182992, "frac_reward_zero_std": 1.0, "grad_norm": 0.6640625, "kl": 0.09752961248159409, "learning_rate": 1.7229188224689196e-05, "loss": 0.0039, "num_tokens": 28797746.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6375207526286663, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.05241863592527807, "learning_rate": 1.722696316006662e-05, "loss": 0.0021, "num_tokens": 28804639.0, "reward": 1.892241358757019, "reward_std": 0.11575041711330414, "rewards/fixed_code_pass_all_test_reward/mean": 0.892241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.11575044691562653, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 267.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6377052204390334, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.04682483244687319, "learning_rate": 1.722473734620056e-05, "loss": 0.0019, "num_tokens": 28813061.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 468.375, "completions/mean_terminated_length": 468.375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.6378896882494005, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.029011288657784462, "learning_rate": 1.722251078332178e-05, "loss": 0.0012, "num_tokens": 28822136.0, "reward": 1.6607142686843872, "reward_std": 0.682720959186554, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6380741560597676, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0540568835567683, "learning_rate": 1.722028347166111e-05, "loss": 0.0022, "num_tokens": 28832957.0, "reward": 1.014423131942749, "reward_std": 0.04079460725188255, "rewards/fixed_code_pass_all_test_reward/mean": 0.014423076994717121, "rewards/fixed_code_pass_all_test_reward/std": 0.04079462215304375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6382586238701347, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.04164074675645679, "learning_rate": 1.7218055411449467e-05, "loss": 0.0017, "num_tokens": 28838874.0, "reward": 1.487069010734558, "reward_std": 0.25167182087898254, "rewards/fixed_code_pass_all_test_reward/mean": 0.48706895112991333, "rewards/fixed_code_pass_all_test_reward/std": 0.25167182087898254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 804.375, "completions/mean_terminated_length": 804.375, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.6384430916805017, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.036046633729711175, "learning_rate": 1.7215826602917838e-05, "loss": 0.0014, "num_tokens": 28855845.0, "reward": 1.8125, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 368.75, "completions/mean_terminated_length": 368.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6386275594908688, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.06934052752330899, "learning_rate": 1.7213597046297295e-05, "loss": 0.0028, "num_tokens": 28865147.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.638812027301236, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.05190222733654082, "learning_rate": 1.7211366741818976e-05, "loss": 0.0021, "num_tokens": 28872589.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6389964951116031, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.02695427555590868, "learning_rate": 1.7209135689714114e-05, "loss": 0.0011, "num_tokens": 28878271.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6391809629219701, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.03707165876403451, "learning_rate": 1.7206903890214007e-05, "loss": 0.0015, "num_tokens": 28885210.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6393654307323372, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.07017428148537874, "learning_rate": 1.720467134355003e-05, "loss": 0.0028, "num_tokens": 28889133.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6395498985427043, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.03562236996367574, "learning_rate": 1.720243804995364e-05, "loss": 0.0014, "num_tokens": 28893165.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6397343663530713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.031986344954930246, "learning_rate": 1.7200204009656368e-05, "loss": 0.0013, "num_tokens": 28901670.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6399188341634385, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04303280997555703, "learning_rate": 1.7197969222889827e-05, "loss": 0.0017, "num_tokens": 28911052.0, "reward": 1.3830645084381104, "reward_std": 0.5110456943511963, "rewards/fixed_code_pass_all_test_reward/mean": 0.5080645084381104, "rewards/fixed_code_pass_all_test_reward/std": 0.5260425209999084, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6401033019738056, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.039954389445483685, "learning_rate": 1.719573368988571e-05, "loss": 0.0016, "num_tokens": 28918336.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6402877697841727, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.06670100055634975, "learning_rate": 1.7193497410875766e-05, "loss": 0.0027, "num_tokens": 28923132.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 425.75, "completions/mean_terminated_length": 425.75, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6404722375945398, "frac_reward_zero_std": 1.0, "grad_norm": 0.05126953125, "kl": 0.03627040924038738, "learning_rate": 1.7191260386091857e-05, "loss": 0.0015, "num_tokens": 28931386.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6406567054049068, "frac_reward_zero_std": 1.0, "grad_norm": 0.34765625, "kl": 0.044685350148938596, "learning_rate": 1.718902261576589e-05, "loss": 0.0018, "num_tokens": 28935457.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 461.75, "completions/mean_terminated_length": 461.75, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.6408411732152739, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03351482772268355, "learning_rate": 1.7186784100129864e-05, "loss": 0.0013, "num_tokens": 28944111.0, "reward": 1.7946429252624512, "reward_std": 0.3920446038246155, "rewards/fixed_code_pass_all_test_reward/mean": 0.7946428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3920446038246155, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 352.125, "completions/mean_terminated_length": 352.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6410256410256411, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.03916339739225805, "learning_rate": 1.718454483941586e-05, "loss": 0.0016, "num_tokens": 28953096.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 317.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6412101088360082, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0762798935174942, "learning_rate": 1.7182304833856026e-05, "loss": 0.0031, "num_tokens": 28959700.0, "reward": 1.771505355834961, "reward_std": 0.4230898320674896, "rewards/fixed_code_pass_all_test_reward/mean": 0.7715053558349609, "rewards/fixed_code_pass_all_test_reward/std": 0.4230898320674896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.6413945766463752, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.06368000456131995, "learning_rate": 1.7180064083682583e-05, "loss": 0.0025, "num_tokens": 28963547.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6415790444567423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.029705253487918526, "learning_rate": 1.717782258912785e-05, "loss": 0.0012, "num_tokens": 28974716.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 545.5, "completions/mean_terminated_length": 330.8571472167969, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.6417635122671094, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.14663005527108908, "learning_rate": 1.7175580350424202e-05, "loss": 0.0059, "num_tokens": 28981944.0, "reward": 0.875, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6419479800774764, "frac_reward_zero_std": 1.0, "grad_norm": 0.033935546875, "kl": 0.013653967762365937, "learning_rate": 1.7173337367804103e-05, "loss": 0.0005, "num_tokens": 28988805.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.6421324478878435, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.046565545722842216, "learning_rate": 1.7171093641500087e-05, "loss": 0.0019, "num_tokens": 28998150.0, "reward": 1.25, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 470.375, "completions/mean_terminated_length": 470.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6423169156982107, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.04831159207969904, "learning_rate": 1.716884917174477e-05, "loss": 0.0019, "num_tokens": 29009161.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.6425013835085778, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.05346830980852246, "learning_rate": 1.7166603958770848e-05, "loss": 0.0021, "num_tokens": 29018471.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6426858513189448, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.06737195048481226, "learning_rate": 1.716435800281109e-05, "loss": 0.0027, "num_tokens": 29023989.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6428703191293119, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.06831734557636082, "learning_rate": 1.716211130409833e-05, "loss": 0.0027, "num_tokens": 29028441.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.643054786939679, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.13315195124596357, "learning_rate": 1.71598638628655e-05, "loss": 0.0053, "num_tokens": 29032595.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6432392547500461, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.021956480981316417, "learning_rate": 1.7157615679345602e-05, "loss": 0.0009, "num_tokens": 29039217.0, "reward": 1.7374999523162842, "reward_std": 0.25599944591522217, "rewards/fixed_code_pass_all_test_reward/mean": 0.737500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.25599944591522217, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6434237225604132, "frac_reward_zero_std": 0.0, "grad_norm": 2.765625, "kl": 0.2908115000464022, "learning_rate": 1.7155366753771708e-05, "loss": 0.0116, "num_tokens": 29047842.0, "reward": 1.125, "reward_std": 0.05050764977931976, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 172.375, "completions/mean_terminated_length": 172.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6436081903707803, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.07113793632015586, "learning_rate": 1.7153117086376974e-05, "loss": 0.0028, "num_tokens": 29052141.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6437926581811474, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.047252022195607424, "learning_rate": 1.715086667739463e-05, "loss": 0.0019, "num_tokens": 29061671.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 429.375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6439771259915145, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.04195108765270561, "learning_rate": 1.714861552705798e-05, "loss": 0.0017, "num_tokens": 29072754.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 601.875, "completions/mean_terminated_length": 601.875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.6441615938018815, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.028670434840023518, "learning_rate": 1.7146363635600413e-05, "loss": 0.0011, "num_tokens": 29083465.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6443460616122486, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.04713781480677426, "learning_rate": 1.7144111003255386e-05, "loss": 0.0019, "num_tokens": 29088323.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 526.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.6445305294226158, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06541133904829621, "learning_rate": 1.7141857630256442e-05, "loss": 0.0026, "num_tokens": 29099399.0, "reward": 1.658046007156372, "reward_std": 0.25195762515068054, "rewards/fixed_code_pass_all_test_reward/mean": 0.6580460071563721, "rewards/fixed_code_pass_all_test_reward/std": 0.25195759534835815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6447149972329829, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.08131601242348552, "learning_rate": 1.7139603516837193e-05, "loss": 0.0033, "num_tokens": 29103439.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6448994650433499, "frac_reward_zero_std": 1.0, "grad_norm": 0.033935546875, "kl": 0.014463061816059053, "learning_rate": 1.713734866323133e-05, "loss": 0.0006, "num_tokens": 29110348.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.645083932853717, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.045549745904281735, "learning_rate": 1.7135093069672623e-05, "loss": 0.0018, "num_tokens": 29120064.0, "reward": 1.8081896305084229, "reward_std": 0.23103035986423492, "rewards/fixed_code_pass_all_test_reward/mean": 0.8081896305084229, "rewards/fixed_code_pass_all_test_reward/std": 0.23103035986423492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 550.75, "completions/mean_terminated_length": 550.75, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6452684006640841, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.034454753156751394, "learning_rate": 1.713283673639491e-05, "loss": 0.0014, "num_tokens": 29129782.0, "reward": 1.4722222089767456, "reward_std": 0.051434475928545, "rewards/fixed_code_pass_all_test_reward/mean": 0.4722222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.051434461027383804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 710.25, "completions/mean_terminated_length": 710.25, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.6454528684744512, "frac_reward_zero_std": 0.0, "grad_norm": 0.6796875, "kl": 0.03268989152275026, "learning_rate": 1.7130579663632124e-05, "loss": 0.0013, "num_tokens": 29145472.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6456373362848183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.026255927921738476, "learning_rate": 1.7128321851618256e-05, "loss": 0.0011, "num_tokens": 29150602.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.6458218040951854, "frac_reward_zero_std": 1.0, "grad_norm": 0.75, "kl": 0.10595285682938993, "learning_rate": 1.7126063300587382e-05, "loss": 0.0042, "num_tokens": 29154490.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6460062719055525, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.027833778643980622, "learning_rate": 1.7123804010773657e-05, "loss": 0.0011, "num_tokens": 29162976.0, "reward": 1.9249999523162842, "reward_std": 0.2121320217847824, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6461907397159196, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.042864460963755846, "learning_rate": 1.7121543982411302e-05, "loss": 0.0017, "num_tokens": 29167594.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6463752075262866, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.03370582510251552, "learning_rate": 1.711928321573463e-05, "loss": 0.0013, "num_tokens": 29171972.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6465596753366537, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.05031390301883221, "learning_rate": 1.7117021710978016e-05, "loss": 0.002, "num_tokens": 29178613.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.6467441431470209, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.13079465553164482, "learning_rate": 1.711475946837592e-05, "loss": 0.0052, "num_tokens": 29185806.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.646928610957388, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.04557510884478688, "learning_rate": 1.711249648816288e-05, "loss": 0.0018, "num_tokens": 29193920.0, "reward": 1.625, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 412.625, "completions/mean_terminated_length": 412.625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.647113078767755, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.0454241493716836, "learning_rate": 1.7110232770573497e-05, "loss": 0.0018, "num_tokens": 29205317.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6472975465781221, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.061412982642650604, "learning_rate": 1.7107968315842467e-05, "loss": 0.0025, "num_tokens": 29211798.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6474820143884892, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.059985117986798286, "learning_rate": 1.710570312420455e-05, "loss": 0.0024, "num_tokens": 29219981.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6476664821988563, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04265649407170713, "learning_rate": 1.7103437195894588e-05, "loss": 0.0017, "num_tokens": 29225572.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 298.125, "completions/mean_terminated_length": 298.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6478509500092234, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.05274852621369064, "learning_rate": 1.7101170531147496e-05, "loss": 0.0021, "num_tokens": 29232037.0, "reward": 1.0500000715255737, "reward_std": 0.4375254809856415, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.12817399203777313, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6480354178195905, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.04025741631630808, "learning_rate": 1.709890313019827e-05, "loss": 0.0016, "num_tokens": 29242501.0, "reward": 1.7000000476837158, "reward_std": 0.32071352005004883, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 512.25, "completions/mean_terminated_length": 512.25, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6482198856299576, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.044669343158602715, "learning_rate": 1.7096634993281972e-05, "loss": 0.0018, "num_tokens": 29252159.0, "reward": 1.3958333730697632, "reward_std": 0.12400396168231964, "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.12400396913290024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6484043534403247, "frac_reward_zero_std": 1.0, "grad_norm": 0.35546875, "kl": 0.059085462940856814, "learning_rate": 1.709436612063375e-05, "loss": 0.0024, "num_tokens": 29256631.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6485888212506917, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05148002109490335, "learning_rate": 1.709209651248883e-05, "loss": 0.0021, "num_tokens": 29266018.0, "reward": 1.5478723049163818, "reward_std": 0.7314721345901489, "rewards/fixed_code_pass_all_test_reward/mean": 0.6728723049163818, "rewards/fixed_code_pass_all_test_reward/std": 0.46669283509254456, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6487732890610588, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.031050573103129864, "learning_rate": 1.7089826169082506e-05, "loss": 0.0012, "num_tokens": 29270731.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 512.625, "completions/mean_terminated_length": 512.625, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.648957756871426, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05682907020673156, "learning_rate": 1.7087555090650153e-05, "loss": 0.0023, "num_tokens": 29279624.0, "reward": 1.6800000667572021, "reward_std": 0.2892354428768158, "rewards/fixed_code_pass_all_test_reward/mean": 0.6800000071525574, "rewards/fixed_code_pass_all_test_reward/std": 0.2892354726791382, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 392.25, "completions/mean_terminated_length": 392.25, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6491422246817931, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.03782501083333045, "learning_rate": 1.708528327742722e-05, "loss": 0.0015, "num_tokens": 29289570.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6493266924921601, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.04337457357905805, "learning_rate": 1.7083010729649237e-05, "loss": 0.0017, "num_tokens": 29298080.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6495111603025272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.03730009822174907, "learning_rate": 1.7080737447551804e-05, "loss": 0.0015, "num_tokens": 29302803.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 490.5, "completions/mean_terminated_length": 490.5, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6496956281128943, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.0712610399350524, "learning_rate": 1.7078463431370598e-05, "loss": 0.0029, "num_tokens": 29313143.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 440.75, "completions/mean_terminated_length": 440.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.6498800959232613, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.044433149974793196, "learning_rate": 1.7076188681341378e-05, "loss": 0.0018, "num_tokens": 29324869.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.6500645637336285, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.03620890202000737, "learning_rate": 1.707391319769997e-05, "loss": 0.0014, "num_tokens": 29331767.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6502490315439956, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.02555637143086642, "learning_rate": 1.7071636980682288e-05, "loss": 0.001, "num_tokens": 29336134.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 329.0, "completions/mean_terminated_length": 329.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6504334993543627, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.0402876086300239, "learning_rate": 1.706936003052431e-05, "loss": 0.0016, "num_tokens": 29345094.0, "reward": 1.0083333253860474, "reward_std": 0.0235702283680439, "rewards/fixed_code_pass_all_test_reward/mean": 0.008333333767950535, "rewards/fixed_code_pass_all_test_reward/std": 0.0235702283680439, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.6506179671647297, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.044863255927339196, "learning_rate": 1.7067082347462095e-05, "loss": 0.0018, "num_tokens": 29353947.0, "reward": 1.567307710647583, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 653.125, "completions/mean_terminated_length": 653.125, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.6508024349750968, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.027257801266387105, "learning_rate": 1.7064803931731778e-05, "loss": 0.0011, "num_tokens": 29369188.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 461.25, "completions/mean_terminated_length": 461.25, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.6509869027854639, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.05260556470602751, "learning_rate": 1.7062524783569573e-05, "loss": 0.0021, "num_tokens": 29381398.0, "reward": 1.5, "reward_std": 0.5629958510398865, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6511713705958311, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.055208828300237656, "learning_rate": 1.7060244903211764e-05, "loss": 0.0022, "num_tokens": 29388033.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6513558384061982, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.1010616400744766, "learning_rate": 1.705796429089472e-05, "loss": 0.004, "num_tokens": 29395634.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6515403062165652, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.05227800470311195, "learning_rate": 1.705568294685487e-05, "loss": 0.0021, "num_tokens": 29400273.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 458.875, "completions/mean_terminated_length": 458.875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6517247740269323, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.02697537071071565, "learning_rate": 1.705340087132873e-05, "loss": 0.0011, "num_tokens": 29408688.0, "reward": 1.7272727489471436, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 646.375, "completions/mean_terminated_length": 646.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.6519092418372994, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.043093855725601315, "learning_rate": 1.70511180645529e-05, "loss": 0.0017, "num_tokens": 29420851.0, "reward": 1.8181817531585693, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6520937096476664, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.01608935120748356, "learning_rate": 1.7048834526764035e-05, "loss": 0.0006, "num_tokens": 29426723.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3535533845424652, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6522781774580336, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.05386209720745683, "learning_rate": 1.7046550258198885e-05, "loss": 0.0022, "num_tokens": 29437063.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 533.75, "completions/mean_terminated_length": 533.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6524626452684007, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.04106337716802955, "learning_rate": 1.7044265259094263e-05, "loss": 0.0016, "num_tokens": 29448269.0, "reward": 1.29296875, "reward_std": 0.452219694852829, "rewards/fixed_code_pass_all_test_reward/mean": 0.29296875, "rewards/fixed_code_pass_all_test_reward/std": 0.452219694852829, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6526471130787678, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.028331871260888875, "learning_rate": 1.7041979529687063e-05, "loss": 0.0011, "num_tokens": 29457167.0, "reward": 1.851190447807312, "reward_std": 0.12322601675987244, "rewards/fixed_code_pass_all_test_reward/mean": 0.851190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.12322598695755005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 780.125, "completions/mean_terminated_length": 780.125, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.6528315808891348, "frac_reward_zero_std": 0.0, "grad_norm": 0.55859375, "kl": 0.02169166275416501, "learning_rate": 1.7039693070214257e-05, "loss": 0.0009, "num_tokens": 29478680.0, "reward": 1.8709677457809448, "reward_std": 0.35210511088371277, "rewards/fixed_code_pass_all_test_reward/mean": 0.8709677457809448, "rewards/fixed_code_pass_all_test_reward/std": 0.35210511088371277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6530160486995019, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.040681323735043406, "learning_rate": 1.7037405880912887e-05, "loss": 0.0016, "num_tokens": 29482788.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 474.625, "completions/mean_terminated_length": 474.625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.653200516509869, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.03449256962630898, "learning_rate": 1.7035117962020074e-05, "loss": 0.0014, "num_tokens": 29493545.0, "reward": 1.9621212482452393, "reward_std": 0.10713736712932587, "rewards/fixed_code_pass_all_test_reward/mean": 0.9621212482452393, "rewards/fixed_code_pass_all_test_reward/std": 0.10713739693164825, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6533849843202362, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.044530095998197794, "learning_rate": 1.7032829313773018e-05, "loss": 0.0018, "num_tokens": 29500202.0, "reward": 1.389423131942749, "reward_std": 0.15982544422149658, "rewards/fixed_code_pass_all_test_reward/mean": 0.38942307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.1598254293203354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 403.75, "completions/mean_terminated_length": 403.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6535694521306032, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.08502795756794512, "learning_rate": 1.7030539936408986e-05, "loss": 0.0034, "num_tokens": 29509808.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 595.0, "completions/mean_terminated_length": 387.4285888671875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6537539199409703, "frac_reward_zero_std": 0.0, "grad_norm": 0.498046875, "kl": 0.023293629987165332, "learning_rate": 1.7028249830165328e-05, "loss": 0.0009, "num_tokens": 29517872.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 494.25, "completions/mean_terminated_length": 494.25, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6539383877513374, "frac_reward_zero_std": 1.0, "grad_norm": 0.28125, "kl": 0.07382483687251806, "learning_rate": 1.7025958995279466e-05, "loss": 0.003, "num_tokens": 29530434.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 535.875, "completions/mean_terminated_length": 535.875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.6541228555617045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0224609375, "kl": 0.009964551223674789, "learning_rate": 1.70236674319889e-05, "loss": 0.0004, "num_tokens": 29539689.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6543073233720715, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.0392616440076381, "learning_rate": 1.7021375140531203e-05, "loss": 0.0016, "num_tokens": 29544960.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.6544917911824386, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.08010514662601054, "learning_rate": 1.7019082121144023e-05, "loss": 0.0032, "num_tokens": 29548888.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 421.25, "completions/mean_terminated_length": 421.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6546762589928058, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.04015236138366163, "learning_rate": 1.7016788374065084e-05, "loss": 0.0016, "num_tokens": 29560538.0, "reward": 1.9861111640930176, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6548607268031729, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.03754666610620916, "learning_rate": 1.701449389953219e-05, "loss": 0.0015, "num_tokens": 29564600.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 451.5, "completions/mean_terminated_length": 451.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.6550451946135399, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.04572094860486686, "learning_rate": 1.701219869778322e-05, "loss": 0.0018, "num_tokens": 29572308.0, "reward": 1.0367647409439087, "reward_std": 0.3572911322116852, "rewards/fixed_code_pass_all_test_reward/mean": 0.1617647111415863, "rewards/fixed_code_pass_all_test_reward/std": 0.08281682431697845, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.655229662423907, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.050917466869577765, "learning_rate": 1.7009902769056117e-05, "loss": 0.002, "num_tokens": 29578393.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6554141302342741, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.04505270323716104, "learning_rate": 1.700760611358891e-05, "loss": 0.0018, "num_tokens": 29583152.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6555985980446412, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.031955579994246364, "learning_rate": 1.7005308731619707e-05, "loss": 0.0013, "num_tokens": 29588707.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 426.5, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6557830658550083, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.03818586328998208, "learning_rate": 1.7003010623386678e-05, "loss": 0.0015, "num_tokens": 29599247.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 314.875, "completions/mean_terminated_length": 314.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6559675336653754, "frac_reward_zero_std": 1.0, "grad_norm": 0.038330078125, "kl": 0.029397392878308892, "learning_rate": 1.7000711789128082e-05, "loss": 0.0012, "num_tokens": 29609102.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6561520014757425, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.048595622880384326, "learning_rate": 1.699841222908224e-05, "loss": 0.0019, "num_tokens": 29618058.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 611.875, "completions/mean_terminated_length": 611.875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.6563364692861096, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.030419808928854764, "learning_rate": 1.6996111943487555e-05, "loss": 0.0012, "num_tokens": 29633521.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6565209370964766, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.055334446020424366, "learning_rate": 1.6993810932582513e-05, "loss": 0.0022, "num_tokens": 29643472.0, "reward": 1.788690447807312, "reward_std": 0.308787077665329, "rewards/fixed_code_pass_all_test_reward/mean": 0.788690447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.308787077665329, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 650.0, "completions/mean_terminated_length": 650.0, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.6567054049068437, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.04806937696412206, "learning_rate": 1.6991509196605664e-05, "loss": 0.0019, "num_tokens": 29657144.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.6568898727172109, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.05940876272507012, "learning_rate": 1.6989206735795634e-05, "loss": 0.0024, "num_tokens": 29666199.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 552.25, "completions/mean_terminated_length": 552.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.657074340527578, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.040676050120964646, "learning_rate": 1.698690355039113e-05, "loss": 0.0016, "num_tokens": 29682265.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 478.25, "completions/mean_terminated_length": 478.25, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.657258808337945, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.04319406393915415, "learning_rate": 1.6984599640630927e-05, "loss": 0.0017, "num_tokens": 29691235.0, "reward": 1.9812500476837158, "reward_std": 0.0530330166220665, "rewards/fixed_code_pass_all_test_reward/mean": 0.981249988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.053033001720905304, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 403.25, "completions/mean_terminated_length": 403.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6574432761483121, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.014921379159204662, "learning_rate": 1.6982295006753883e-05, "loss": 0.0006, "num_tokens": 29698837.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6576277439586792, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.07079661986790597, "learning_rate": 1.6979989648998928e-05, "loss": 0.0028, "num_tokens": 29706971.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 389.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.6578122117690463, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.03142881614621729, "learning_rate": 1.697768356760506e-05, "loss": 0.0013, "num_tokens": 29714487.0, "reward": 1.3318965435028076, "reward_std": 0.0853404551744461, "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.0853404700756073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 380.125, "completions/mean_terminated_length": 380.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6579966795794134, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.05338608706369996, "learning_rate": 1.6975376762811365e-05, "loss": 0.0021, "num_tokens": 29726440.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 487.0, "completions/mean_terminated_length": 487.0, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.6581811473897805, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.03242480428889394, "learning_rate": 1.6973069234856995e-05, "loss": 0.0013, "num_tokens": 29738104.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6583656152001476, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.04973982577212155, "learning_rate": 1.6970760983981174e-05, "loss": 0.002, "num_tokens": 29745925.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.6585500830105147, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.04524690145626664, "learning_rate": 1.6968452010423212e-05, "loss": 0.0018, "num_tokens": 29750124.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 441.625, "completions/mean_terminated_length": 441.625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.6587345508208817, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.029386924114078283, "learning_rate": 1.6966142314422487e-05, "loss": 0.0012, "num_tokens": 29761337.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.6589190186312488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.041866542655043304, "learning_rate": 1.6963831896218453e-05, "loss": 0.0017, "num_tokens": 29765498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 526.375, "completions/mean_terminated_length": 526.375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.659103486441616, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.02973798383027315, "learning_rate": 1.6961520756050643e-05, "loss": 0.0012, "num_tokens": 29774941.0, "reward": 1.5, "reward_std": 0.076360322535038, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.07636036723852158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.659287954251983, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.050248213578015566, "learning_rate": 1.695920889415865e-05, "loss": 0.002, "num_tokens": 29783883.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 366.0, "completions/mean_terminated_length": 366.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6594724220623501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.04503906494937837, "learning_rate": 1.6956896310782158e-05, "loss": 0.0018, "num_tokens": 29792155.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 263.875, "completions/mean_terminated_length": 263.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6596568898727172, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.04810227151028812, "learning_rate": 1.6954583006160923e-05, "loss": 0.0019, "num_tokens": 29797442.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 399.625, "completions/mean_terminated_length": 399.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6598413576830843, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03518701670691371, "learning_rate": 1.695226898053477e-05, "loss": 0.0014, "num_tokens": 29808303.0, "reward": 1.633802890777588, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6600258254934513, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.043448704993352294, "learning_rate": 1.6949954234143603e-05, "loss": 0.0017, "num_tokens": 29817071.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 397.25, "completions/mean_terminated_length": 397.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6602102933038185, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.07032757764682174, "learning_rate": 1.6947638767227398e-05, "loss": 0.0028, "num_tokens": 29827089.0, "reward": 1.5749999284744263, "reward_std": 0.6363961100578308, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 398.5, "completions/mean_terminated_length": 398.5, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.6603947611141856, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.040227144258096814, "learning_rate": 1.694532258002621e-05, "loss": 0.0016, "num_tokens": 29838677.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6605792289245527, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.04096542880870402, "learning_rate": 1.6943005672780164e-05, "loss": 0.0016, "num_tokens": 29843481.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 772.625, "completions/mean_terminated_length": 772.625, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.6607636967349197, "frac_reward_zero_std": 0.0, "grad_norm": 0.6640625, "kl": 0.03150696854572743, "learning_rate": 1.6940688045729458e-05, "loss": 0.0013, "num_tokens": 29862262.0, "reward": 1.5714285373687744, "reward_std": 0.4948716461658478, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.49487167596817017, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 560.0, "completions/mean_terminated_length": 560.0, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.6609481645452868, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.03657388058491051, "learning_rate": 1.6938369699114376e-05, "loss": 0.0015, "num_tokens": 29874310.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 469.375, "completions/mean_terminated_length": 469.375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6611326323556539, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.054257039446383715, "learning_rate": 1.6936050633175263e-05, "loss": 0.0022, "num_tokens": 29886097.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6613171001660211, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.06538175465539098, "learning_rate": 1.6933730848152544e-05, "loss": 0.0026, "num_tokens": 29892299.0, "reward": 1.671875, "reward_std": 0.3531585931777954, "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, "rewards/fixed_code_pass_all_test_reward/std": 0.3531585931777954, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6615015679763881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.07822249410673976, "learning_rate": 1.6931410344286722e-05, "loss": 0.0031, "num_tokens": 29897880.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 731.875, "completions/mean_terminated_length": 543.857177734375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6616860357867552, "frac_reward_zero_std": 0.0, "grad_norm": 0.38671875, "kl": 0.018130575361283263, "learning_rate": 1.6929089121818375e-05, "loss": 0.0007, "num_tokens": 29907247.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 404.625, "completions/mean_terminated_length": 404.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6618705035971223, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.05864599673077464, "learning_rate": 1.692676718098814e-05, "loss": 0.0023, "num_tokens": 29917476.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6620549714074894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.027187709463760257, "learning_rate": 1.692444452203675e-05, "loss": 0.0011, "num_tokens": 29922454.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6622394392178564, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0859684725292027, "learning_rate": 1.6922121145205e-05, "loss": 0.0034, "num_tokens": 29933687.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 463.875, "completions/mean_terminated_length": 463.875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.6624239070282236, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.03686350304633379, "learning_rate": 1.6919797050733765e-05, "loss": 0.0015, "num_tokens": 29942454.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6626083748385907, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06840147660113871, "learning_rate": 1.6917472238863988e-05, "loss": 0.0027, "num_tokens": 29952088.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6627928426489578, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.058992582373321056, "learning_rate": 1.691514670983669e-05, "loss": 0.0024, "num_tokens": 29957500.0, "reward": 1.6749999523162842, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6629773104593248, "frac_reward_zero_std": 1.0, "grad_norm": 0.36328125, "kl": 0.05894047557376325, "learning_rate": 1.691282046389297e-05, "loss": 0.0024, "num_tokens": 29964487.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 496.875, "completions/mean_terminated_length": 496.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6631617782696919, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.024673782143509015, "learning_rate": 1.691049350127399e-05, "loss": 0.001, "num_tokens": 29973606.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.663346246080059, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.09928623866289854, "learning_rate": 1.6908165822221004e-05, "loss": 0.004, "num_tokens": 29978474.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 530.25, "completions/mean_terminated_length": 530.25, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.6635307138904262, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.04722412442788482, "learning_rate": 1.6905837426975324e-05, "loss": 0.0019, "num_tokens": 29990364.0, "reward": 1.0520832538604736, "reward_std": 0.14731387794017792, "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833320915699, "rewards/fixed_code_pass_all_test_reward/std": 0.1473139077425003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 440.375, "completions/mean_terminated_length": 440.375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.6637151817007932, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.029176080133765936, "learning_rate": 1.6903508315778342e-05, "loss": 0.0012, "num_tokens": 29998839.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6638996495111603, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.06269190064631402, "learning_rate": 1.690117848887153e-05, "loss": 0.0025, "num_tokens": 30005440.0, "reward": 1.6126374006271362, "reward_std": 0.26735803484916687, "rewards/fixed_code_pass_all_test_reward/mean": 0.6126374006271362, "rewards/fixed_code_pass_all_test_reward/std": 0.26735803484916687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6640841173215274, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.034976087510585785, "learning_rate": 1.6898847946496428e-05, "loss": 0.0014, "num_tokens": 30018597.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6642685851318945, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.05378284677863121, "learning_rate": 1.6896516688894648e-05, "loss": 0.0022, "num_tokens": 30027829.0, "reward": 1.3583333492279053, "reward_std": 0.28270238637924194, "rewards/fixed_code_pass_all_test_reward/mean": 0.3583333492279053, "rewards/fixed_code_pass_all_test_reward/std": 0.28270238637924194, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 353.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6644530529422615, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.04154221131466329, "learning_rate": 1.6894184716307877e-05, "loss": 0.0017, "num_tokens": 30034581.0, "reward": 1.5723683834075928, "reward_std": 0.13473652303218842, "rewards/fixed_code_pass_all_test_reward/mean": 0.5723683834075928, "rewards/fixed_code_pass_all_test_reward/std": 0.1347365379333496, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6646375207526287, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.05066532548516989, "learning_rate": 1.6891852028977883e-05, "loss": 0.002, "num_tokens": 30044308.0, "reward": 1.2904412746429443, "reward_std": 0.15872822701931, "rewards/fixed_code_pass_all_test_reward/mean": 0.2904411852359772, "rewards/fixed_code_pass_all_test_reward/std": 0.1587281972169876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 717.0, "completions/mean_terminated_length": 717.0, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.6648219885629958, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.02740553067997098, "learning_rate": 1.6889518627146505e-05, "loss": 0.0011, "num_tokens": 30062468.0, "reward": 1.3125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 94.5, "completions/mean_terminated_length": 94.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6650064563733629, "frac_reward_zero_std": 0.0, "grad_norm": 3.15625, "kl": 0.09422318125143647, "learning_rate": 1.6887184511055648e-05, "loss": 0.0038, "num_tokens": 30066112.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6651909241837299, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.056179068284109235, "learning_rate": 1.68848496809473e-05, "loss": 0.0022, "num_tokens": 30070548.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 460.375, "completions/mean_terminated_length": 460.375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.665375391994097, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.07556358119472861, "learning_rate": 1.6882514137063525e-05, "loss": 0.003, "num_tokens": 30080423.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 501.375, "completions/mean_terminated_length": 501.375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.6655598598044641, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.029042242909781635, "learning_rate": 1.688017787964646e-05, "loss": 0.0012, "num_tokens": 30088722.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6657443276148313, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.05258803511969745, "learning_rate": 1.6877840908938295e-05, "loss": 0.0021, "num_tokens": 30093468.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6659287954251983, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.03798463998828083, "learning_rate": 1.687550322518133e-05, "loss": 0.0015, "num_tokens": 30098213.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 403.375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.6661132632355654, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04534350661560893, "learning_rate": 1.687316482861791e-05, "loss": 0.0018, "num_tokens": 30106856.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6662977310459325, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.04544827644713223, "learning_rate": 1.6870825719490473e-05, "loss": 0.0018, "num_tokens": 30116056.0, "reward": 1.9956896305084229, "reward_std": 0.01219149399548769, "rewards/fixed_code_pass_all_test_reward/mean": 0.9956896305084229, "rewards/fixed_code_pass_all_test_reward/std": 0.012191502377390862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6664821988562996, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.05729085230268538, "learning_rate": 1.6868485898041514e-05, "loss": 0.0023, "num_tokens": 30120457.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.12405193597078323, "learning_rate": 1.6866145364513613e-05, "loss": 0.005, "num_tokens": 30124758.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.6668511344770337, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.04399727890267968, "learning_rate": 1.686380411914942e-05, "loss": 0.0018, "num_tokens": 30129128.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6670356022874009, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.03026028093881905, "learning_rate": 1.6861462162191665e-05, "loss": 0.0012, "num_tokens": 30134864.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 518.25, "completions/mean_terminated_length": 518.25, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.667220070097768, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.05629445053637028, "learning_rate": 1.6859119493883137e-05, "loss": 0.0023, "num_tokens": 30144266.0, "reward": 1.4500000476837158, "reward_std": 0.4869731366634369, "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, "rewards/fixed_code_pass_all_test_reward/std": 0.4869731664657593, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.667404537908135, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.021892332122661173, "learning_rate": 1.6856776114466717e-05, "loss": 0.0009, "num_tokens": 30149304.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6675890057185021, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03007274738047272, "learning_rate": 1.685443202418535e-05, "loss": 0.0012, "num_tokens": 30154178.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 506.125, "completions/mean_terminated_length": 506.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.6677734735288692, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.024745488539338112, "learning_rate": 1.685208722328205e-05, "loss": 0.001, "num_tokens": 30163275.0, "reward": 1.1750000715255737, "reward_std": 0.0707106813788414, "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.6679579413392362, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.050702113658189774, "learning_rate": 1.6849741711999915e-05, "loss": 0.002, "num_tokens": 30170707.0, "reward": 1.692307710647583, "reward_std": 0.43319571018218994, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.43319568037986755, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 713.375, "completions/mean_terminated_length": 713.375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.6681424091496034, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.03704624925740063, "learning_rate": 1.684739549058211e-05, "loss": 0.0015, "num_tokens": 30183262.0, "reward": 1.6458332538604736, "reward_std": 0.49149513244628906, "rewards/fixed_code_pass_all_test_reward/mean": 0.6458333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.49149513244628906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6683268769599705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.023277232074178755, "learning_rate": 1.684504855927188e-05, "loss": 0.0009, "num_tokens": 30189168.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 356.5, "completions/mean_terminated_length": 356.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6685113447703376, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.030712550156749785, "learning_rate": 1.6842700918312532e-05, "loss": 0.0012, "num_tokens": 30195588.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 306.125, "completions/mean_terminated_length": 306.125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6686958125807047, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.06425913004204631, "learning_rate": 1.6840352567947457e-05, "loss": 0.0026, "num_tokens": 30204101.0, "reward": 1.84375, "reward_std": 0.35197147727012634, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6688802803910717, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.026010525063611567, "learning_rate": 1.6838003508420117e-05, "loss": 0.001, "num_tokens": 30208712.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6690647482014388, "frac_reward_zero_std": 1.0, "grad_norm": 0.061767578125, "kl": 0.050524536054581404, "learning_rate": 1.683565373997405e-05, "loss": 0.002, "num_tokens": 30216885.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.669249216011806, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.025121041224338114, "learning_rate": 1.683330326285286e-05, "loss": 0.001, "num_tokens": 30223168.0, "reward": 1.375, "reward_std": 0.3284160792827606, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.328416109085083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 439.5, "completions/mean_terminated_length": 439.5, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.669433683822173, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.03377341921441257, "learning_rate": 1.6830952077300227e-05, "loss": 0.0014, "num_tokens": 30233700.0, "reward": 1.7159090042114258, "reward_std": 0.28927093744277954, "rewards/fixed_code_pass_all_test_reward/mean": 0.7159091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.28927096724510193, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 462.25, "completions/mean_terminated_length": 462.25, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.6696181516325401, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.03839342808350921, "learning_rate": 1.6828600183559914e-05, "loss": 0.0015, "num_tokens": 30242854.0, "reward": 1.100000023841858, "reward_std": 0.1069045215845108, "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, "rewards/fixed_code_pass_all_test_reward/std": 0.10690450668334961, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 415.75, "completions/mean_terminated_length": 415.75, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6698026194429072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.040028820745646954, "learning_rate": 1.6826247581875744e-05, "loss": 0.0016, "num_tokens": 30255428.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6699870872532743, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.03933128924109042, "learning_rate": 1.6823894272491617e-05, "loss": 0.0016, "num_tokens": 30259516.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6701715550636413, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.06784466980025172, "learning_rate": 1.6821540255651512e-05, "loss": 0.0027, "num_tokens": 30263970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6703560228740085, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.05190662620589137, "learning_rate": 1.681918553159948e-05, "loss": 0.0021, "num_tokens": 30272896.0, "reward": 1.683333396911621, "reward_std": 0.36121487617492676, "rewards/fixed_code_pass_all_test_reward/mean": 0.6833333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.36121487617492676, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6705404906843756, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.031712502939626575, "learning_rate": 1.6816830100579637e-05, "loss": 0.0013, "num_tokens": 30278369.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6707249584947427, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.04547358653508127, "learning_rate": 1.6814473962836184e-05, "loss": 0.0018, "num_tokens": 30285936.0, "reward": 1.0520833730697632, "reward_std": 0.06200197711586952, "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.06200198829174042, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6709094263051097, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.02471423684619367, "learning_rate": 1.6812117118613386e-05, "loss": 0.001, "num_tokens": 30290414.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6710938941154768, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.07697925763204694, "learning_rate": 1.6809759568155587e-05, "loss": 0.0031, "num_tokens": 30297224.0, "reward": 1.2916667461395264, "reward_std": 0.7000566720962524, "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 520.0, "completions/mean_terminated_length": 520.0, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.6712783619258439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0277099609375, "kl": 0.022352686617523432, "learning_rate": 1.6807401311707203e-05, "loss": 0.0009, "num_tokens": 30308480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 366.0, "completions/mean_terminated_length": 366.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6714628297362111, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.08344901911914349, "learning_rate": 1.680504234951272e-05, "loss": 0.0033, "num_tokens": 30317008.0, "reward": 1.5, "reward_std": 0.47245559096336365, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6716472975465781, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.041772550670430064, "learning_rate": 1.6802682681816703e-05, "loss": 0.0017, "num_tokens": 30323353.0, "reward": 1.519230842590332, "reward_std": 0.05439284071326256, "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.054392825812101364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6718317653569452, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.064312475733459, "learning_rate": 1.680032230886378e-05, "loss": 0.0026, "num_tokens": 30331856.0, "reward": 1.734375, "reward_std": 0.33698704838752747, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.33698704838752747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6720162331673123, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.06125589879229665, "learning_rate": 1.6797961230898665e-05, "loss": 0.0025, "num_tokens": 30338856.0, "reward": 1.7437500953674316, "reward_std": 0.29693374037742615, "rewards/fixed_code_pass_all_test_reward/mean": 0.7437499761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6722007009776794, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.025104926084168255, "learning_rate": 1.6795599448166138e-05, "loss": 0.001, "num_tokens": 30343645.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6723851687880464, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.07905304967425764, "learning_rate": 1.679323696091105e-05, "loss": 0.0032, "num_tokens": 30351120.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6725696365984136, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "kl": 0.08563129277899861, "learning_rate": 1.6790873769378327e-05, "loss": 0.0034, "num_tokens": 30356710.0, "reward": 1.600000023841858, "reward_std": 0.4276179373264313, "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.42761799693107605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 674.75, "completions/mean_terminated_length": 674.75, "completions/min_length": 578.0, "completions/min_terminated_length": 578.0, "epoch": 0.6727541044087807, "frac_reward_zero_std": 0.0, "grad_norm": 0.74609375, "kl": 0.04026862815953791, "learning_rate": 1.6788509873812976e-05, "loss": 0.0016, "num_tokens": 30368948.0, "reward": 1.5625, "reward_std": 0.47087812423706055, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.47087812423706055, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6729385722191478, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.07020091847516596, "learning_rate": 1.6786145274460066e-05, "loss": 0.0028, "num_tokens": 30373654.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6731230400295148, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.018735545221716166, "learning_rate": 1.678377997156474e-05, "loss": 0.0007, "num_tokens": 30378931.0, "reward": 1.8888888359069824, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6733075078398819, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.03307704837061465, "learning_rate": 1.678141396537222e-05, "loss": 0.0013, "num_tokens": 30389091.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.673491975650249, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.05448441533371806, "learning_rate": 1.67790472561278e-05, "loss": 0.0022, "num_tokens": 30400118.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6736764434606162, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.03550602833274752, "learning_rate": 1.677667984407684e-05, "loss": 0.0014, "num_tokens": 30404728.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.6738609112709832, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.033871169900521636, "learning_rate": 1.6774311729464777e-05, "loss": 0.0014, "num_tokens": 30416701.0, "reward": 1.5, "reward_std": 0.022580957040190697, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.022580984979867935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6740453790813503, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.04331885394640267, "learning_rate": 1.6771942912537128e-05, "loss": 0.0017, "num_tokens": 30423210.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6742298468917174, "frac_reward_zero_std": 1.0, "grad_norm": 0.2275390625, "kl": 0.05692193889990449, "learning_rate": 1.6769573393539465e-05, "loss": 0.0023, "num_tokens": 30427717.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6744143147020845, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.03306740149855614, "learning_rate": 1.6767203172717457e-05, "loss": 0.0013, "num_tokens": 30437441.0, "reward": 1.9038461446762085, "reward_std": 0.2719641625881195, "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6745987825124515, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.06323527102358639, "learning_rate": 1.6764832250316827e-05, "loss": 0.0025, "num_tokens": 30443021.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6747832503228187, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.07102045184001327, "learning_rate": 1.6762460626583378e-05, "loss": 0.0028, "num_tokens": 30450547.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6749677181331858, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.054579525254666805, "learning_rate": 1.6760088301762975e-05, "loss": 0.0022, "num_tokens": 30454990.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6751521859435529, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.033556015929207206, "learning_rate": 1.675771527610158e-05, "loss": 0.0013, "num_tokens": 30464288.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6753366537539199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.03203866514377296, "learning_rate": 1.6755341549845198e-05, "loss": 0.0013, "num_tokens": 30469476.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.675521121564287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.03386988490819931, "learning_rate": 1.6752967123239933e-05, "loss": 0.0014, "num_tokens": 30473928.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6757055893746541, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.04499918804503977, "learning_rate": 1.6750591996531942e-05, "loss": 0.0018, "num_tokens": 30482087.0, "reward": 1.9795454740524292, "reward_std": 0.05785420909523964, "rewards/fixed_code_pass_all_test_reward/mean": 0.9795454740524292, "rewards/fixed_code_pass_all_test_reward/std": 0.05785420164465904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.6758900571850213, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.053676349343732, "learning_rate": 1.6748216169967465e-05, "loss": 0.0021, "num_tokens": 30496882.0, "reward": 1.519230842590332, "reward_std": 0.5147855877876282, "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.514785647392273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6760745249953883, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.054519183468073606, "learning_rate": 1.674583964379281e-05, "loss": 0.0022, "num_tokens": 30520343.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 391.75, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6762589928057554, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.06618706090375781, "learning_rate": 1.674346241825437e-05, "loss": 0.0026, "num_tokens": 30527437.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6764434606161225, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.08858846966177225, "learning_rate": 1.674108449359858e-05, "loss": 0.0035, "num_tokens": 30538302.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 630.125, "completions/mean_terminated_length": 630.125, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.6766279284264896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.040105503518134356, "learning_rate": 1.6738705870071986e-05, "loss": 0.0016, "num_tokens": 30554223.0, "reward": 1.2857142686843872, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6768123962368566, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.020336553105153143, "learning_rate": 1.6736326547921177e-05, "loss": 0.0008, "num_tokens": 30562034.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 477.875, "completions/mean_terminated_length": 477.875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.6769968640472238, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.045014480827376246, "learning_rate": 1.6733946527392832e-05, "loss": 0.0018, "num_tokens": 30574513.0, "reward": 1.375, "reward_std": 0.2653239965438843, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.2653239965438843, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6771813318575909, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.059637637343257666, "learning_rate": 1.673156580873369e-05, "loss": 0.0024, "num_tokens": 30580593.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.677365799667958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.029901932226493955, "learning_rate": 1.6729184392190575e-05, "loss": 0.0012, "num_tokens": 30585189.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 426.25, "completions/mean_terminated_length": 426.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.677550267478325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.04053531982935965, "learning_rate": 1.6726802278010365e-05, "loss": 0.0016, "num_tokens": 30594663.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6777347352886921, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.06520006060600281, "learning_rate": 1.6724419466440035e-05, "loss": 0.0026, "num_tokens": 30604673.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 480.125, "completions/mean_terminated_length": 480.125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6779192030990592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.039823418483138084, "learning_rate": 1.6722035957726607e-05, "loss": 0.0016, "num_tokens": 30613386.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 454.75, "completions/mean_terminated_length": 454.75, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6781036709094264, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.04387431056238711, "learning_rate": 1.6719651752117198e-05, "loss": 0.0018, "num_tokens": 30621880.0, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6782881387197934, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.057102636666968465, "learning_rate": 1.6717266849858978e-05, "loss": 0.0023, "num_tokens": 30631581.0, "reward": 1.932692289352417, "reward_std": 0.1903749406337738, "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.6784726065301605, "frac_reward_zero_std": 1.0, "grad_norm": 0.2099609375, "kl": 0.072473946493119, "learning_rate": 1.6714881251199204e-05, "loss": 0.0029, "num_tokens": 30636044.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 578.25, "completions/mean_terminated_length": 578.25, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.6786570743405276, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.033939379965886474, "learning_rate": 1.6712494956385195e-05, "loss": 0.0014, "num_tokens": 30648022.0, "reward": 1.1184210777282715, "reward_std": 0.059678610414266586, "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, "rewards/fixed_code_pass_all_test_reward/std": 0.05967859923839569, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6788415421508947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.05604859907180071, "learning_rate": 1.6710107965664354e-05, "loss": 0.0022, "num_tokens": 30656323.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 231.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6790260099612617, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.03550931951031089, "learning_rate": 1.6707720279284138e-05, "loss": 0.0014, "num_tokens": 30661702.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6792104777716288, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.054418426705524325, "learning_rate": 1.6705331897492088e-05, "loss": 0.0022, "num_tokens": 30672044.0, "reward": 1.3191488981246948, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3191489279270172, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.679394945581996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.018756062956526875, "learning_rate": 1.6702942820535823e-05, "loss": 0.0008, "num_tokens": 30676440.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 483.25, "completions/mean_terminated_length": 483.25, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.679579413392363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.05454242089763284, "learning_rate": 1.6700553048663014e-05, "loss": 0.0022, "num_tokens": 30687514.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 553.25, "completions/mean_terminated_length": 553.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6797638812027301, "frac_reward_zero_std": 0.0, "grad_norm": 0.6328125, "kl": 0.029599675559438765, "learning_rate": 1.669816258212143e-05, "loss": 0.0012, "num_tokens": 30699556.0, "reward": 1.5933098793029785, "reward_std": 0.4485257863998413, "rewards/fixed_code_pass_all_test_reward/mean": 0.7183098196983337, "rewards/fixed_code_pass_all_test_reward/std": 0.16698987782001495, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 625.125, "completions/mean_terminated_length": 625.125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.6799483490130972, "frac_reward_zero_std": 1.0, "grad_norm": 0.04443359375, "kl": 0.02558919822331518, "learning_rate": 1.6695771421158894e-05, "loss": 0.001, "num_tokens": 30713629.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 411.625, "completions/mean_terminated_length": 411.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.6801328168234643, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.052797636250033975, "learning_rate": 1.66933795660233e-05, "loss": 0.0021, "num_tokens": 30721330.0, "reward": 1.900362253189087, "reward_std": 0.2818179428577423, "rewards/fixed_code_pass_all_test_reward/mean": 0.9003623127937317, "rewards/fixed_code_pass_all_test_reward/std": 0.2818179130554199, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6803172846338313, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.0236003496684134, "learning_rate": 1.669098701696263e-05, "loss": 0.0009, "num_tokens": 30725889.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6805017524441985, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.043364679673686624, "learning_rate": 1.6688593774224918e-05, "loss": 0.0017, "num_tokens": 30730625.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6806862202545656, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.05513603752478957, "learning_rate": 1.6686199838058284e-05, "loss": 0.0022, "num_tokens": 30742692.0, "reward": 1.2512136697769165, "reward_std": 0.0908447802066803, "rewards/fixed_code_pass_all_test_reward/mean": 0.25121361017227173, "rewards/fixed_code_pass_all_test_reward/std": 0.09084472805261612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6808706880649327, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.05261865328066051, "learning_rate": 1.6683805208710915e-05, "loss": 0.0021, "num_tokens": 30748894.0, "reward": 1.734375, "reward_std": 0.36659735441207886, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.36659735441207886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6810551558752997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.05463936785236001, "learning_rate": 1.668140988643107e-05, "loss": 0.0022, "num_tokens": 30759506.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6812396236856668, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.04200354893691838, "learning_rate": 1.667901387146708e-05, "loss": 0.0017, "num_tokens": 30766830.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 403.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6814240914960339, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03389909607358277, "learning_rate": 1.6676617164067346e-05, "loss": 0.0014, "num_tokens": 30774169.0, "reward": 1.784482717514038, "reward_std": 0.06957854330539703, "rewards/fixed_code_pass_all_test_reward/mean": 0.7844827175140381, "rewards/fixed_code_pass_all_test_reward/std": 0.06957856565713882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 626.625, "completions/mean_terminated_length": 626.625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.6816085593064011, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.04003201948944479, "learning_rate": 1.667421976448035e-05, "loss": 0.0016, "num_tokens": 30789254.0, "reward": 1.829545497894287, "reward_std": 0.3156205117702484, "rewards/fixed_code_pass_all_test_reward/mean": 0.8295454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.3156205117702484, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 477.75, "completions/mean_terminated_length": 477.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.6817930271167681, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.03479286341462284, "learning_rate": 1.6671821672954628e-05, "loss": 0.0014, "num_tokens": 30801772.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6819774949271352, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.07828248082660139, "learning_rate": 1.6669422889738807e-05, "loss": 0.0031, "num_tokens": 30807838.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6821619627375023, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.03397643659263849, "learning_rate": 1.666702341508157e-05, "loss": 0.0014, "num_tokens": 30812098.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6823464305478694, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.07354838983155787, "learning_rate": 1.6664623249231685e-05, "loss": 0.0029, "num_tokens": 30821282.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 355.0, "completions/mean_terminated_length": 355.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6825308983582364, "frac_reward_zero_std": 1.0, "grad_norm": 0.2470703125, "kl": 0.07811527559533715, "learning_rate": 1.6662222392437982e-05, "loss": 0.0031, "num_tokens": 30832650.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6827153661686036, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0605581016279757, "learning_rate": 1.6659820844949362e-05, "loss": 0.0024, "num_tokens": 30838402.0, "reward": 1.6349999904632568, "reward_std": 0.2622430622577667, "rewards/fixed_code_pass_all_test_reward/mean": 0.6349999904632568, "rewards/fixed_code_pass_all_test_reward/std": 0.2622430920600891, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6828998339789707, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.04834237997420132, "learning_rate": 1.6657418607014808e-05, "loss": 0.0019, "num_tokens": 30850270.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6830843017893378, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.043270950205624104, "learning_rate": 1.6655015678883365e-05, "loss": 0.0017, "num_tokens": 30855620.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 655.625, "completions/mean_terminated_length": 655.625, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.6832687695997048, "frac_reward_zero_std": 0.0, "grad_norm": 0.6640625, "kl": 0.02660720539279282, "learning_rate": 1.665261206080415e-05, "loss": 0.0011, "num_tokens": 30872817.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6834532374100719, "frac_reward_zero_std": 0.0, "grad_norm": 0.65625, "kl": 0.017618111392948776, "learning_rate": 1.6650207753026366e-05, "loss": 0.0007, "num_tokens": 30878717.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.683637705220439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.03448926075361669, "learning_rate": 1.6647802755799258e-05, "loss": 0.0014, "num_tokens": 30886880.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 392.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6838221730308062, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.07361371186561882, "learning_rate": 1.6645397069372175e-05, "loss": 0.0029, "num_tokens": 30896676.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6840066408411732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0537109375, "kl": 0.023489110986702144, "learning_rate": 1.664299069399451e-05, "loss": 0.0009, "num_tokens": 30906568.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6841911086515403, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.07241898169741035, "learning_rate": 1.664058362991575e-05, "loss": 0.0029, "num_tokens": 30912844.0, "reward": 1.517045497894287, "reward_std": 0.23673485219478607, "rewards/fixed_code_pass_all_test_reward/mean": 0.5170454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.23673486709594727, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 443.625, "completions/mean_terminated_length": 443.625, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.6843755764619074, "frac_reward_zero_std": 0.0, "grad_norm": 0.88671875, "kl": 0.036237545078620315, "learning_rate": 1.6638175877385442e-05, "loss": 0.0014, "num_tokens": 30921545.0, "reward": 1.375, "reward_std": 0.25253811478614807, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6845600442722745, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.060210245195776224, "learning_rate": 1.66357674366532e-05, "loss": 0.0024, "num_tokens": 30930217.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.6847445120826415, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.0596439428627491, "learning_rate": 1.6633358307968722e-05, "loss": 0.0024, "num_tokens": 30938858.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 378.0, "completions/mean_terminated_length": 378.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6849289798930087, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04982957674656063, "learning_rate": 1.6630948491581763e-05, "loss": 0.002, "num_tokens": 30946562.0, "reward": 1.9134615659713745, "reward_std": 0.12462963908910751, "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.12462963163852692, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6851134477033758, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.044732645619660616, "learning_rate": 1.6628537987742165e-05, "loss": 0.0018, "num_tokens": 30958557.0, "reward": 1.9861111640930176, "reward_std": 0.03928373008966446, "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6852979155137429, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04951454885303974, "learning_rate": 1.6626126796699828e-05, "loss": 0.002, "num_tokens": 30967930.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6854823833241099, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.06312663108110428, "learning_rate": 1.6623714918704728e-05, "loss": 0.0025, "num_tokens": 30974876.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 584.75, "completions/mean_terminated_length": 584.75, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.685666851134477, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.028705685050226748, "learning_rate": 1.6621302354006915e-05, "loss": 0.0011, "num_tokens": 30985450.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 738.0, "completions/mean_terminated_length": 738.0, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 0.6858513189448441, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.02508676890283823, "learning_rate": 1.6618889102856506e-05, "loss": 0.001, "num_tokens": 31002466.0, "reward": 1.4444444179534912, "reward_std": 0.4663894474506378, "rewards/fixed_code_pass_all_test_reward/mean": 0.4444444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.4663894772529602, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 498.0, "completions/mean_terminated_length": 498.0, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.6860357867552113, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.10521011147648096, "learning_rate": 1.661647516550369e-05, "loss": 0.0042, "num_tokens": 31011386.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6862202545655783, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.02660631516482681, "learning_rate": 1.661406054219873e-05, "loss": 0.0011, "num_tokens": 31016386.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.6864047223759454, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.059233935084193945, "learning_rate": 1.6611645233191957e-05, "loss": 0.0024, "num_tokens": 31027163.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 372.25, "completions/mean_terminated_length": 372.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.6865891901863125, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.038138950476422906, "learning_rate": 1.6609229238733776e-05, "loss": 0.0015, "num_tokens": 31036197.0, "reward": 1.756250023841858, "reward_std": 0.31672146916389465, "rewards/fixed_code_pass_all_test_reward/mean": 0.7562500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.31672149896621704, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 449.875, "completions/mean_terminated_length": 449.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6867736579966796, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03688846342265606, "learning_rate": 1.660681255907466e-05, "loss": 0.0015, "num_tokens": 31047116.0, "reward": 1.7697367668151855, "reward_std": 0.017285054549574852, "rewards/fixed_code_pass_all_test_reward/mean": 0.7697368264198303, "rewards/fixed_code_pass_all_test_reward/std": 0.0172850601375103, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6869581258070466, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.038218742702156305, "learning_rate": 1.660439519446515e-05, "loss": 0.0015, "num_tokens": 31055852.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6871425936174138, "frac_reward_zero_std": 0.0, "grad_norm": 0.79296875, "kl": 0.03660299978218973, "learning_rate": 1.660197714515587e-05, "loss": 0.0015, "num_tokens": 31063051.0, "reward": 1.9673912525177002, "reward_std": 0.09223129600286484, "rewards/fixed_code_pass_all_test_reward/mean": 0.967391312122345, "rewards/fixed_code_pass_all_test_reward/std": 0.09223131835460663, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 599.25, "completions/mean_terminated_length": 599.25, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.6873270614277809, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.03879402298480272, "learning_rate": 1.65995584113975e-05, "loss": 0.0016, "num_tokens": 31076405.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 394.375, "completions/mean_terminated_length": 394.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.687511529238148, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.05983232660219073, "learning_rate": 1.65971389934408e-05, "loss": 0.0024, "num_tokens": 31085816.0, "reward": 1.6486486196517944, "reward_std": 0.05004430189728737, "rewards/fixed_code_pass_all_test_reward/mean": 0.6486486196517944, "rewards/fixed_code_pass_all_test_reward/std": 0.050044331699609756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.687695997048515, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.05465828743763268, "learning_rate": 1.659471889153661e-05, "loss": 0.0022, "num_tokens": 31091249.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 416.625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.6878804648588821, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.040180700132623315, "learning_rate": 1.6592298105935814e-05, "loss": 0.0016, "num_tokens": 31100854.0, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.6880649326692492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0546875, "kl": 0.04227416682988405, "learning_rate": 1.6589876636889392e-05, "loss": 0.0017, "num_tokens": 31109759.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6882494004796164, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.11573283281177282, "learning_rate": 1.658745448464838e-05, "loss": 0.0046, "num_tokens": 31118129.0, "reward": 1.9419643878936768, "reward_std": 0.06313452869653702, "rewards/fixed_code_pass_all_test_reward/mean": 0.9419642686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.06313455104827881, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.6884338682899834, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.03768581850454211, "learning_rate": 1.65850316494639e-05, "loss": 0.0015, "num_tokens": 31124788.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 699.75, "completions/mean_terminated_length": 699.75, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.6886183361003505, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.03282665414735675, "learning_rate": 1.658260813158713e-05, "loss": 0.0013, "num_tokens": 31139770.0, "reward": 1.7788461446762085, "reward_std": 0.41411587595939636, "rewards/fixed_code_pass_all_test_reward/mean": 0.7788461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.41411590576171875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6888028039107176, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02671283797826618, "learning_rate": 1.6580183931269318e-05, "loss": 0.0011, "num_tokens": 31148548.0, "reward": 1.8041666746139526, "reward_std": 0.27970364689826965, "rewards/fixed_code_pass_all_test_reward/mean": 0.8041666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.27970364689826965, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 382.0, "completions/mean_terminated_length": 382.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6889872717210846, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.059421731159090996, "learning_rate": 1.65777590487618e-05, "loss": 0.0024, "num_tokens": 31156820.0, "reward": 1.6836419105529785, "reward_std": 0.26196935772895813, "rewards/fixed_code_pass_all_test_reward/mean": 0.6836419701576233, "rewards/fixed_code_pass_all_test_reward/std": 0.26196932792663574, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6891717395314517, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.05504172947257757, "learning_rate": 1.6575333484315964e-05, "loss": 0.0022, "num_tokens": 31160768.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.6893562073418189, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.022291914792731404, "learning_rate": 1.657290723818328e-05, "loss": 0.0009, "num_tokens": 31173811.0, "reward": 1.6229338645935059, "reward_std": 0.13969051837921143, "rewards/fixed_code_pass_all_test_reward/mean": 0.6229339241981506, "rewards/fixed_code_pass_all_test_reward/std": 0.1396905481815338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.689540675152186, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.0274247172055766, "learning_rate": 1.6570480310615285e-05, "loss": 0.0011, "num_tokens": 31180298.0, "reward": 1.1875, "reward_std": 0.3282996118068695, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3282995820045471, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.689725142962553, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.08229600777849555, "learning_rate": 1.6568052701863586e-05, "loss": 0.0033, "num_tokens": 31189074.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.6899096107729201, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.029903542017564178, "learning_rate": 1.656562441217986e-05, "loss": 0.0012, "num_tokens": 31194415.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 412.5, "completions/mean_terminated_length": 412.5, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6900940785832872, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.04061175766400993, "learning_rate": 1.6563195441815855e-05, "loss": 0.0016, "num_tokens": 31206131.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.6902785463936543, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.022111187456175685, "learning_rate": 1.6560765791023395e-05, "loss": 0.0009, "num_tokens": 31213926.0, "reward": 1.2000000476837158, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6904630142040215, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.04408752894960344, "learning_rate": 1.6558335460054367e-05, "loss": 0.0018, "num_tokens": 31218947.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6906474820143885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.04815824655815959, "learning_rate": 1.6555904449160734e-05, "loss": 0.0019, "num_tokens": 31226260.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6908319498247556, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.05263505480252206, "learning_rate": 1.6553472758594522e-05, "loss": 0.0021, "num_tokens": 31233432.0, "reward": 1.875, "reward_std": 0.23754699528217316, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.23754698038101196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.6910164176351227, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.042811705032363534, "learning_rate": 1.6551040388607838e-05, "loss": 0.0017, "num_tokens": 31238175.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 378.375, "completions/mean_terminated_length": 378.375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.6912008854454897, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.025162800797261298, "learning_rate": 1.6548607339452853e-05, "loss": 0.001, "num_tokens": 31245602.0, "reward": 1.4074074029922485, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.40740740299224854, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 356.0, "completions/mean_terminated_length": 356.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6913853532558568, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.04650401254184544, "learning_rate": 1.6546173611381805e-05, "loss": 0.0019, "num_tokens": 31255274.0, "reward": 1.8125, "reward_std": 0.33407655358314514, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.6915698210662239, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.05231584212742746, "learning_rate": 1.654373920464701e-05, "loss": 0.0021, "num_tokens": 31263923.0, "reward": 1.1875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6917542888765911, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.034739643801003695, "learning_rate": 1.6541304119500853e-05, "loss": 0.0014, "num_tokens": 31271290.0, "reward": 1.7406914234161377, "reward_std": 0.01880604401230812, "rewards/fixed_code_pass_all_test_reward/mean": 0.7406914830207825, "rewards/fixed_code_pass_all_test_reward/std": 0.018806051462888718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 555.75, "completions/mean_terminated_length": 555.75, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.6919387566869581, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.023392248433083296, "learning_rate": 1.6538868356195787e-05, "loss": 0.0009, "num_tokens": 31285776.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 737.125, "completions/mean_terminated_length": 737.125, "completions/min_length": 606.0, "completions/min_terminated_length": 606.0, "epoch": 0.6921232244973252, "frac_reward_zero_std": 1.0, "grad_norm": 0.040771484375, "kl": 0.021976477495627478, "learning_rate": 1.653643191498433e-05, "loss": 0.0009, "num_tokens": 31302281.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6923076923076923, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.04640924069099128, "learning_rate": 1.653399479611908e-05, "loss": 0.0019, "num_tokens": 31309894.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6924921601180594, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.030307766748592257, "learning_rate": 1.6531556999852703e-05, "loss": 0.0012, "num_tokens": 31321237.0, "reward": 1.8562500476837158, "reward_std": 0.23366260528564453, "rewards/fixed_code_pass_all_test_reward/mean": 0.856249988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.23366262018680573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6926766279284264, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.0664343957323581, "learning_rate": 1.652911852643793e-05, "loss": 0.0027, "num_tokens": 31327390.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6928610957387936, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.039893782464787364, "learning_rate": 1.652667937612757e-05, "loss": 0.0016, "num_tokens": 31334312.0, "reward": 1.3333333730697632, "reward_std": 0.35634827613830566, "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.35634830594062805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 408.25, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6930455635491607, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.03531626146286726, "learning_rate": 1.6524239549174496e-05, "loss": 0.0014, "num_tokens": 31348082.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.6932300313595278, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05047107767313719, "learning_rate": 1.652179904583165e-05, "loss": 0.002, "num_tokens": 31352222.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 699.25, "completions/mean_terminated_length": 699.25, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.6934144991698948, "frac_reward_zero_std": 0.0, "grad_norm": 0.60546875, "kl": 0.022191372932866216, "learning_rate": 1.651935786635205e-05, "loss": 0.0009, "num_tokens": 31364880.0, "reward": 1.7757353782653809, "reward_std": 0.22887110710144043, "rewards/fixed_code_pass_all_test_reward/mean": 0.7757352590560913, "rewards/fixed_code_pass_all_test_reward/std": 0.22887110710144043, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6935989669802619, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.024636907037347555, "learning_rate": 1.6516916010988784e-05, "loss": 0.001, "num_tokens": 31369053.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 314.875, "completions/mean_terminated_length": 314.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.693783434790629, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.05058677773922682, "learning_rate": 1.6514473479995003e-05, "loss": 0.002, "num_tokens": 31377700.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6939679026009962, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03490825439803302, "learning_rate": 1.651203027362393e-05, "loss": 0.0014, "num_tokens": 31385592.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 381.5, "completions/mean_terminated_length": 381.5, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6941523704113632, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.025275283493101597, "learning_rate": 1.6509586392128865e-05, "loss": 0.001, "num_tokens": 31392868.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6943368382217303, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.03417039941996336, "learning_rate": 1.6507141835763173e-05, "loss": 0.0014, "num_tokens": 31402306.0, "reward": 1.4595588445663452, "reward_std": 0.15318132936954498, "rewards/fixed_code_pass_all_test_reward/mean": 0.4595588445663452, "rewards/fixed_code_pass_all_test_reward/std": 0.15318137407302856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6945213060320974, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0877725169993937, "learning_rate": 1.650469660478029e-05, "loss": 0.0035, "num_tokens": 31409622.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 590.125, "completions/mean_terminated_length": 590.125, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.6947057738424645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.03946866886690259, "learning_rate": 1.650225069943372e-05, "loss": 0.0016, "num_tokens": 31425255.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 383.875, "completions/mean_terminated_length": 383.875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6948902416528315, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.058194668497890234, "learning_rate": 1.649980411997704e-05, "loss": 0.0023, "num_tokens": 31432806.0, "reward": 1.8571429252624512, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 331.5, "completions/mean_terminated_length": 331.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6950747094631987, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.08639300125651062, "learning_rate": 1.649735686666389e-05, "loss": 0.0035, "num_tokens": 31439122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6952591772735658, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05590819241479039, "learning_rate": 1.649490893974799e-05, "loss": 0.0022, "num_tokens": 31449501.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.6954436450839329, "frac_reward_zero_std": 0.0, "grad_norm": 3.09375, "kl": 0.08549389196559787, "learning_rate": 1.649246033948312e-05, "loss": 0.0034, "num_tokens": 31453356.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6956281128942999, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.05683800647966564, "learning_rate": 1.649001106612314e-05, "loss": 0.0023, "num_tokens": 31461892.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.695812580704667, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.048190937377512455, "learning_rate": 1.648756111992197e-05, "loss": 0.0019, "num_tokens": 31470426.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 313.25, "completions/mean_terminated_length": 313.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6959970485150341, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06945065618492663, "learning_rate": 1.648511050113361e-05, "loss": 0.0028, "num_tokens": 31479740.0, "reward": 1.8023256063461304, "reward_std": 0.3662329316139221, "rewards/fixed_code_pass_all_test_reward/mean": 0.8023256063461304, "rewards/fixed_code_pass_all_test_reward/std": 0.3662329614162445, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6961815163254013, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05162624339573085, "learning_rate": 1.6482659210012117e-05, "loss": 0.0021, "num_tokens": 31489240.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6963659841357683, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.07444684160873294, "learning_rate": 1.648020724681163e-05, "loss": 0.003, "num_tokens": 31493477.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6965504519461354, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.02291830931790173, "learning_rate": 1.647775461178635e-05, "loss": 0.0009, "num_tokens": 31499067.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 683.625, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.6967349197565025, "frac_reward_zero_std": 0.0, "grad_norm": 0.68359375, "kl": 0.034360678400844336, "learning_rate": 1.6475301305190546e-05, "loss": 0.0014, "num_tokens": 31515248.0, "reward": 1.9659091234207153, "reward_std": 0.09642363339662552, "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6969193875668696, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.055777879199013114, "learning_rate": 1.6472847327278563e-05, "loss": 0.0022, "num_tokens": 31523157.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6971038553772366, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517578125, "kl": 0.0304083910305053, "learning_rate": 1.647039267830482e-05, "loss": 0.0012, "num_tokens": 31531507.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 534.0, "completions/mean_terminated_length": 534.0, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.6972883231876038, "frac_reward_zero_std": 0.0, "grad_norm": 0.85546875, "kl": 0.038748868741095066, "learning_rate": 1.6467937358523788e-05, "loss": 0.0016, "num_tokens": 31540955.0, "reward": 1.4821429252624512, "reward_std": 0.9161254167556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6974727909979709, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.04506985703483224, "learning_rate": 1.6465481368190025e-05, "loss": 0.0018, "num_tokens": 31545157.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.697657258808338, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.02738941623829305, "learning_rate": 1.646302470755815e-05, "loss": 0.0011, "num_tokens": 31551819.0, "reward": 1.4886363744735718, "reward_std": 0.30321967601776123, "rewards/fixed_code_pass_all_test_reward/mean": 0.4886363446712494, "rewards/fixed_code_pass_all_test_reward/std": 0.30321964621543884, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 404.125, "completions/mean_terminated_length": 404.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.697841726618705, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.039601166266947985, "learning_rate": 1.6460567376882854e-05, "loss": 0.0016, "num_tokens": 31563508.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6980261944290721, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04861987382173538, "learning_rate": 1.6458109376418896e-05, "loss": 0.0019, "num_tokens": 31571212.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6982106622394392, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.050927909556776285, "learning_rate": 1.6455650706421103e-05, "loss": 0.002, "num_tokens": 31578725.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6983951300498064, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.04488826100714505, "learning_rate": 1.6453191367144377e-05, "loss": 0.0018, "num_tokens": 31583440.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6985795978601734, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.061344979563727975, "learning_rate": 1.6450731358843685e-05, "loss": 0.0025, "num_tokens": 31594916.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6987640656705405, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.04234885121695697, "learning_rate": 1.6448270681774062e-05, "loss": 0.0017, "num_tokens": 31603753.0, "reward": 1.2941176891326904, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.29411765933036804, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6989485334809076, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06471045897342265, "learning_rate": 1.6445809336190618e-05, "loss": 0.0026, "num_tokens": 31613947.0, "reward": 1.9500000476837158, "reward_std": 0.1414213627576828, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6991330012912746, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.07278218492865562, "learning_rate": 1.6443347322348526e-05, "loss": 0.0029, "num_tokens": 31623323.0, "reward": 1.8899999856948853, "reward_std": 0.3111270070075989, "rewards/fixed_code_pass_all_test_reward/mean": 0.8899999856948853, "rewards/fixed_code_pass_all_test_reward/std": 0.3111269772052765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6993174691016417, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.046646937960758805, "learning_rate": 1.6440884640503035e-05, "loss": 0.0019, "num_tokens": 31632084.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 394.625, "completions/mean_terminated_length": 394.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6995019369120089, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.048393404576927423, "learning_rate": 1.6438421290909453e-05, "loss": 0.0019, "num_tokens": 31643505.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 3792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.699686404722376, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.03222534840460867, "learning_rate": 1.6435957273823172e-05, "loss": 0.0013, "num_tokens": 31652015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.699870872532743, "frac_reward_zero_std": 1.0, "grad_norm": 0.037353515625, "kl": 0.02264452597592026, "learning_rate": 1.643349258949964e-05, "loss": 0.0009, "num_tokens": 31657683.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 352.125, "completions/mean_terminated_length": 352.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7000553403431101, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.038124822080135345, "learning_rate": 1.643102723819438e-05, "loss": 0.0015, "num_tokens": 31665164.0, "reward": 1.855263113975525, "reward_std": 0.2738432288169861, "rewards/fixed_code_pass_all_test_reward/mean": 0.8552631139755249, "rewards/fixed_code_pass_all_test_reward/std": 0.2738432288169861, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.7002398081534772, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.03819207020569593, "learning_rate": 1.6428561220162983e-05, "loss": 0.0015, "num_tokens": 31674378.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7004242759638443, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.04304635152220726, "learning_rate": 1.6426094535661104e-05, "loss": 0.0017, "num_tokens": 31681906.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7006087437742115, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.06083651352673769, "learning_rate": 1.6423627184944484e-05, "loss": 0.0024, "num_tokens": 31691703.0, "reward": 1.682692289352417, "reward_std": 0.42345142364501953, "rewards/fixed_code_pass_all_test_reward/mean": 0.682692289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.42345142364501953, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7007932115845785, "frac_reward_zero_std": 0.0, "grad_norm": 3.484375, "kl": 0.06564672710373998, "learning_rate": 1.6421159168268915e-05, "loss": 0.0026, "num_tokens": 31697621.0, "reward": 1.4375, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7009776793949456, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.029246112098917365, "learning_rate": 1.641869048589026e-05, "loss": 0.0012, "num_tokens": 31707345.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7011621472053127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.0412779722828418, "learning_rate": 1.6416221138064464e-05, "loss": 0.0017, "num_tokens": 31715974.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.7013466150156797, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.01906467555090785, "learning_rate": 1.641375112504753e-05, "loss": 0.0008, "num_tokens": 31720759.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 686.625, "completions/mean_terminated_length": 686.625, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "epoch": 0.7015310828260468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0220947265625, "kl": 0.018477863166481256, "learning_rate": 1.641128044709553e-05, "loss": 0.0007, "num_tokens": 31733588.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 575.625, "completions/mean_terminated_length": 575.625, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.701715550636414, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.03571358881890774, "learning_rate": 1.640880910446461e-05, "loss": 0.0014, "num_tokens": 31744777.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7019000184467811, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517578125, "kl": 0.029027531621977687, "learning_rate": 1.640633709741098e-05, "loss": 0.0012, "num_tokens": 31750945.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7020844862571481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.030808203504420817, "learning_rate": 1.6403864426190925e-05, "loss": 0.0012, "num_tokens": 31754592.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7022689540675152, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.0548455259995535, "learning_rate": 1.640139109106079e-05, "loss": 0.0022, "num_tokens": 31764145.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7024534218778823, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.04162910836748779, "learning_rate": 1.6398917092277e-05, "loss": 0.0017, "num_tokens": 31768953.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7026378896882494, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.04900571936741471, "learning_rate": 1.6396442430096032e-05, "loss": 0.002, "num_tokens": 31777107.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7028223574986165, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.022142526227980852, "learning_rate": 1.6393967104774458e-05, "loss": 0.0009, "num_tokens": 31785790.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7030068253089836, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.05158341769129038, "learning_rate": 1.639149111656889e-05, "loss": 0.0021, "num_tokens": 31791673.0, "reward": 1.4728260040283203, "reward_std": 0.015371894463896751, "rewards/fixed_code_pass_all_test_reward/mean": 0.4728260934352875, "rewards/fixed_code_pass_all_test_reward/std": 0.015371893532574177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7031912931193507, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0340567163657397, "learning_rate": 1.638901446573603e-05, "loss": 0.0014, "num_tokens": 31795821.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7033757609297178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.03761658305302262, "learning_rate": 1.6386537152532637e-05, "loss": 0.0015, "num_tokens": 31805121.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.7035602287400848, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.02764666045550257, "learning_rate": 1.6384059177215544e-05, "loss": 0.0011, "num_tokens": 31808930.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7037446965504519, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.039056660141795874, "learning_rate": 1.6381580540041652e-05, "loss": 0.0016, "num_tokens": 31817249.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.703929164360819, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.06562286498956382, "learning_rate": 1.6379101241267923e-05, "loss": 0.0026, "num_tokens": 31825330.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7041136321711862, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.04143834952265024, "learning_rate": 1.63766212811514e-05, "loss": 0.0017, "num_tokens": 31833140.0, "reward": 1.1691176891326904, "reward_std": 0.020797276869416237, "rewards/fixed_code_pass_all_test_reward/mean": 0.16911765933036804, "rewards/fixed_code_pass_all_test_reward/std": 0.020797260105609894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7042980999815532, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.06332773389294744, "learning_rate": 1.6374140659949193e-05, "loss": 0.0025, "num_tokens": 31837911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7044825677919203, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.05057633062824607, "learning_rate": 1.6371659377918466e-05, "loss": 0.002, "num_tokens": 31845543.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 490.875, "completions/mean_terminated_length": 490.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7046670356022874, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03673546458594501, "learning_rate": 1.6369177435316465e-05, "loss": 0.0015, "num_tokens": 31857318.0, "reward": 1.4715908765792847, "reward_std": 0.32884496450424194, "rewards/fixed_code_pass_all_test_reward/mean": 0.47159093618392944, "rewards/fixed_code_pass_all_test_reward/std": 0.32884499430656433, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7048515034126545, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.04988833353854716, "learning_rate": 1.6366694832400508e-05, "loss": 0.002, "num_tokens": 31865065.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.7050359712230215, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.07212846912443638, "learning_rate": 1.636421156942797e-05, "loss": 0.0029, "num_tokens": 31873914.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7052204390333887, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.06595373246818781, "learning_rate": 1.63617276466563e-05, "loss": 0.0026, "num_tokens": 31881726.0, "reward": 1.7386362552642822, "reward_std": 0.34850838780403137, "rewards/fixed_code_pass_all_test_reward/mean": 0.7386363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.348508358001709, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7054049068437558, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.06758049596101046, "learning_rate": 1.635924306434301e-05, "loss": 0.0027, "num_tokens": 31885922.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7055893746541229, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.05185369309037924, "learning_rate": 1.6356757822745692e-05, "loss": 0.0021, "num_tokens": 31893559.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7057738424644899, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.054493971867486835, "learning_rate": 1.6354271922121992e-05, "loss": 0.0022, "num_tokens": 31901223.0, "reward": 1.8897058963775635, "reward_std": 0.31195884943008423, "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.705958310274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.039790594251826406, "learning_rate": 1.635178536272964e-05, "loss": 0.0016, "num_tokens": 31911239.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.7061427780852241, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.055719359777867794, "learning_rate": 1.634929814482642e-05, "loss": 0.0022, "num_tokens": 31917486.0, "reward": 1.875, "reward_std": 0.2829941511154175, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.28299421072006226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 446.375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.7063272458955913, "frac_reward_zero_std": 1.0, "grad_norm": 0.03955078125, "kl": 0.02822624403052032, "learning_rate": 1.634681026867019e-05, "loss": 0.0011, "num_tokens": 31925697.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7065117137059583, "frac_reward_zero_std": 1.0, "grad_norm": 1.125, "kl": 0.16608937783166766, "learning_rate": 1.6344321734518884e-05, "loss": 0.0066, "num_tokens": 31933677.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7066961815163254, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.08262318256311119, "learning_rate": 1.6341832542630486e-05, "loss": 0.0033, "num_tokens": 31942453.0, "reward": 1.625, "reward_std": 0.27319857478141785, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.27319860458374023, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7068806493266925, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.08549753716215491, "learning_rate": 1.6339342693263067e-05, "loss": 0.0034, "num_tokens": 31951370.0, "reward": 1.653846263885498, "reward_std": 0.477737694978714, "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.477737694978714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7070651171370596, "frac_reward_zero_std": 1.0, "grad_norm": 0.2216796875, "kl": 0.033953696954995394, "learning_rate": 1.633685218667475e-05, "loss": 0.0014, "num_tokens": 31956198.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7072495849474266, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.059018890373408794, "learning_rate": 1.6334361023123743e-05, "loss": 0.0024, "num_tokens": 31967770.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7074340527577938, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.031166540924459696, "learning_rate": 1.6331869202868308e-05, "loss": 0.0012, "num_tokens": 31975931.0, "reward": 1.6750000715255737, "reward_std": 0.348124235868454, "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.348124235868454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7076185205681609, "frac_reward_zero_std": 1.0, "grad_norm": 0.29296875, "kl": 0.049973453977145255, "learning_rate": 1.632937672616678e-05, "loss": 0.002, "num_tokens": 31979966.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.707802988378528, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.03371069743297994, "learning_rate": 1.6326883593277568e-05, "loss": 0.0013, "num_tokens": 31987992.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 289.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.707987456188895, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.052454553777351975, "learning_rate": 1.632438980445914e-05, "loss": 0.0021, "num_tokens": 31994459.0, "reward": 1.5163042545318604, "reward_std": 0.424370676279068, "rewards/fixed_code_pass_all_test_reward/mean": 0.5163043737411499, "rewards/fixed_code_pass_all_test_reward/std": 0.4243707060813904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7081719239992621, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.04932348383590579, "learning_rate": 1.632189535997003e-05, "loss": 0.002, "num_tokens": 32001327.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7083563918096292, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.014756768650840968, "learning_rate": 1.6319400260068854e-05, "loss": 0.0006, "num_tokens": 32007726.0, "reward": 1.5443549156188965, "reward_std": 0.25658419728279114, "rewards/fixed_code_pass_all_test_reward/mean": 0.5443547964096069, "rewards/fixed_code_pass_all_test_reward/std": 0.25658416748046875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7085408596199964, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.05971781606785953, "learning_rate": 1.631690450501428e-05, "loss": 0.0024, "num_tokens": 32015195.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 519.125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.7087253274303634, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.04590933315921575, "learning_rate": 1.6314408095065062e-05, "loss": 0.0018, "num_tokens": 32026684.0, "reward": 1.75, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7089097952407305, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.035430294228717685, "learning_rate": 1.631191103048e-05, "loss": 0.0014, "num_tokens": 32036648.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7090942630510976, "frac_reward_zero_std": 0.0, "grad_norm": 5.09375, "kl": 0.35876134247519076, "learning_rate": 1.6309413311517975e-05, "loss": 0.0144, "num_tokens": 32044927.0, "reward": 1.2619047164916992, "reward_std": 0.42667755484580994, "rewards/fixed_code_pass_all_test_reward/mean": 0.261904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.4266776144504547, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7092787308614646, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.01867935643531382, "learning_rate": 1.6306914938437943e-05, "loss": 0.0007, "num_tokens": 32051159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 515.375, "completions/mean_terminated_length": 515.375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.7094631986718317, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.021954603493213654, "learning_rate": 1.6304415911498907e-05, "loss": 0.0009, "num_tokens": 32061418.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 407.75, "completions/mean_terminated_length": 407.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7096476664821989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.04198176274076104, "learning_rate": 1.6301916230959953e-05, "loss": 0.0017, "num_tokens": 32067720.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 526.5, "completions/mean_terminated_length": 526.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.709832134292566, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.05899894016329199, "learning_rate": 1.6299415897080234e-05, "loss": 0.0024, "num_tokens": 32080916.0, "reward": 1.59375, "reward_std": 0.4419417381286621, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.4419417679309845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.710016602102933, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.042614942183718085, "learning_rate": 1.629691491011897e-05, "loss": 0.0017, "num_tokens": 32087890.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7102010699133001, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.06426581903360784, "learning_rate": 1.6294413270335437e-05, "loss": 0.0026, "num_tokens": 32094315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7103855377236672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.06572533422149718, "learning_rate": 1.6291910977988998e-05, "loss": 0.0026, "num_tokens": 32101343.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7105700055340343, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.10688682459294796, "learning_rate": 1.6289408033339073e-05, "loss": 0.0043, "num_tokens": 32108803.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7107544733444014, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.02985777670983225, "learning_rate": 1.6286904436645145e-05, "loss": 0.0012, "num_tokens": 32113760.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7109389411547685, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.028253308322746307, "learning_rate": 1.6284400188166776e-05, "loss": 0.0011, "num_tokens": 32118768.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7111234089651356, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.04902401682920754, "learning_rate": 1.6281895288163587e-05, "loss": 0.002, "num_tokens": 32127598.0, "reward": 1.9010417461395264, "reward_std": 0.27989640831947327, "rewards/fixed_code_pass_all_test_reward/mean": 0.9010416865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.27989643812179565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7113078767755027, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.07027927599847317, "learning_rate": 1.627938973689527e-05, "loss": 0.0028, "num_tokens": 32134049.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7114923445858697, "frac_reward_zero_std": 1.0, "grad_norm": 0.0322265625, "kl": 0.01673012087121606, "learning_rate": 1.6276883534621582e-05, "loss": 0.0007, "num_tokens": 32141102.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7116768123962368, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.05023044999688864, "learning_rate": 1.6274376681602353e-05, "loss": 0.002, "num_tokens": 32145625.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.711861280206604, "frac_reward_zero_std": 1.0, "grad_norm": 0.04345703125, "kl": 0.03454492520540953, "learning_rate": 1.6271869178097474e-05, "loss": 0.0014, "num_tokens": 32153640.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 251.25, "completions/mean_terminated_length": 251.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7120457480169711, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.04574429360218346, "learning_rate": 1.626936102436691e-05, "loss": 0.0018, "num_tokens": 32159578.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7122302158273381, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.0628775724908337, "learning_rate": 1.626685222067068e-05, "loss": 0.0025, "num_tokens": 32163938.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7124146836377052, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.04103571688756347, "learning_rate": 1.6264342767268892e-05, "loss": 0.0016, "num_tokens": 32169540.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7125991514480723, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.04030704067554325, "learning_rate": 1.6261832664421705e-05, "loss": 0.0016, "num_tokens": 32176048.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7127836192584394, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.05137320491485298, "learning_rate": 1.6259321912389348e-05, "loss": 0.0021, "num_tokens": 32187123.0, "reward": 1.8888888359069824, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 336.625, "completions/mean_terminated_length": 336.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7129680870688065, "frac_reward_zero_std": 1.0, "grad_norm": 0.2373046875, "kl": 0.04837149614468217, "learning_rate": 1.625681051143212e-05, "loss": 0.0019, "num_tokens": 32196856.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 295.875, "completions/mean_terminated_length": 295.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7131525548791736, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03209849586710334, "learning_rate": 1.625429846181039e-05, "loss": 0.0013, "num_tokens": 32205007.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7133370226895407, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.027820597169920802, "learning_rate": 1.6251785763784586e-05, "loss": 0.0011, "num_tokens": 32213859.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7135214904999078, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04559844429604709, "learning_rate": 1.6249272417615202e-05, "loss": 0.0018, "num_tokens": 32224154.0, "reward": 1.954545497894287, "reward_std": 0.03763990476727486, "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454978942871, "rewards/fixed_code_pass_all_test_reward/std": 0.037639934569597244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7137059583102748, "frac_reward_zero_std": 0.0, "grad_norm": 0.70703125, "kl": 0.041830229689367115, "learning_rate": 1.624675842356282e-05, "loss": 0.0017, "num_tokens": 32229641.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7138904261206419, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.04438215680420399, "learning_rate": 1.6244243781888064e-05, "loss": 0.0018, "num_tokens": 32238914.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7140748939310091, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.052020782488398254, "learning_rate": 1.6241728492851637e-05, "loss": 0.0021, "num_tokens": 32243154.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 462.0, "completions/mean_terminated_length": 462.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7142593617413762, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.023980124155059457, "learning_rate": 1.623921255671431e-05, "loss": 0.001, "num_tokens": 32251754.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7144438295517432, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.050589021993801, "learning_rate": 1.6236695973736916e-05, "loss": 0.002, "num_tokens": 32256542.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7146282973621103, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.047554214252159, "learning_rate": 1.6234178744180357e-05, "loss": 0.0019, "num_tokens": 32262547.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7148127651724774, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.06492248130962253, "learning_rate": 1.6231660868305603e-05, "loss": 0.0026, "num_tokens": 32268947.0, "reward": 1.6306817531585693, "reward_std": 0.11762481182813644, "rewards/fixed_code_pass_all_test_reward/mean": 0.6306818127632141, "rewards/fixed_code_pass_all_test_reward/std": 0.11762479692697525, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7149972329828445, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0636580849532038, "learning_rate": 1.6229142346373692e-05, "loss": 0.0025, "num_tokens": 32273272.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7151817007932116, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03331404330674559, "learning_rate": 1.622662317864573e-05, "loss": 0.0013, "num_tokens": 32281985.0, "reward": 1.9791666269302368, "reward_std": 0.058925606310367584, "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 477.125, "completions/mean_terminated_length": 477.125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.7153661686035787, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.04186164797283709, "learning_rate": 1.622410336538288e-05, "loss": 0.0017, "num_tokens": 32296170.0, "reward": 1.6782786846160889, "reward_std": 0.4630914628505707, "rewards/fixed_code_pass_all_test_reward/mean": 0.8032786846160889, "rewards/fixed_code_pass_all_test_reward/std": 0.3816539943218231, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 356.5, "completions/mean_terminated_length": 356.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.7155506364139458, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.03143377776723355, "learning_rate": 1.6221582906846387e-05, "loss": 0.0013, "num_tokens": 32305294.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 453.75, "completions/mean_terminated_length": 453.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7157351042243129, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.023263866896741092, "learning_rate": 1.621906180329755e-05, "loss": 0.0009, "num_tokens": 32317716.0, "reward": 1.7630208730697632, "reward_std": 0.30536240339279175, "rewards/fixed_code_pass_all_test_reward/mean": 0.7630208730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30536243319511414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 464.5, "completions/mean_terminated_length": 464.5, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.7159195720346799, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.048990844399668276, "learning_rate": 1.6216540054997743e-05, "loss": 0.002, "num_tokens": 32326720.0, "reward": 1.8958334922790527, "reward_std": 0.03303440287709236, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.033034369349479675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 367.125, "completions/mean_terminated_length": 367.125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.716104039845047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.03421159158460796, "learning_rate": 1.6214017662208407e-05, "loss": 0.0014, "num_tokens": 32334649.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 586.0, "completions/mean_terminated_length": 586.0, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.7162885076554141, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.04030502657406032, "learning_rate": 1.6211494625191043e-05, "loss": 0.0016, "num_tokens": 32345441.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7164729754657813, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.06134243216365576, "learning_rate": 1.620897094420722e-05, "loss": 0.0025, "num_tokens": 32353616.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.7166574432761483, "frac_reward_zero_std": 1.0, "grad_norm": 2.453125, "kl": 0.10858015157282352, "learning_rate": 1.6206446619518587e-05, "loss": 0.0043, "num_tokens": 32357615.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 351.625, "completions/mean_terminated_length": 351.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.7168419110865154, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.04175499896518886, "learning_rate": 1.6203921651386836e-05, "loss": 0.0017, "num_tokens": 32365492.0, "reward": 1.46875, "reward_std": 0.37796446681022644, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7170263788968825, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.05503495829179883, "learning_rate": 1.6201396040073745e-05, "loss": 0.0022, "num_tokens": 32373418.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7172108467072495, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.06560879666358232, "learning_rate": 1.6198869785841156e-05, "loss": 0.0026, "num_tokens": 32386053.0, "reward": 1.6379311084747314, "reward_std": 0.49970269203186035, "rewards/fixed_code_pass_all_test_reward/mean": 0.6379309892654419, "rewards/fixed_code_pass_all_test_reward/std": 0.49970266222953796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7173953145176166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.04829346132464707, "learning_rate": 1.6196342888950966e-05, "loss": 0.0019, "num_tokens": 32395788.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7175797823279838, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03954493743367493, "learning_rate": 1.6193815349665157e-05, "loss": 0.0016, "num_tokens": 32406303.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7177642501383509, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05106371454894543, "learning_rate": 1.6191287168245758e-05, "loss": 0.002, "num_tokens": 32412914.0, "reward": 1.5250000953674316, "reward_std": 0.2121320515871048, "rewards/fixed_code_pass_all_test_reward/mean": 0.5250000357627869, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320515871048, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.717948717948718, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.0182959494413808, "learning_rate": 1.618875834495488e-05, "loss": 0.0007, "num_tokens": 32423601.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.718133185759085, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.05798303382471204, "learning_rate": 1.618622888005469e-05, "loss": 0.0023, "num_tokens": 32429874.0, "reward": 1.4299449920654297, "reward_std": 0.2857406735420227, "rewards/fixed_code_pass_all_test_reward/mean": 0.42994505167007446, "rewards/fixed_code_pass_all_test_reward/std": 0.2857407033443451, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7183176535694521, "frac_reward_zero_std": 1.0, "grad_norm": 0.048583984375, "kl": 0.038142868084833026, "learning_rate": 1.6183698773807434e-05, "loss": 0.0015, "num_tokens": 32439921.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 376.25, "completions/mean_terminated_length": 376.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7185021213798192, "frac_reward_zero_std": 1.0, "grad_norm": 0.058349609375, "kl": 0.030330535140819848, "learning_rate": 1.6181168026475407e-05, "loss": 0.0012, "num_tokens": 32451707.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7186865891901864, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06698972848244011, "learning_rate": 1.617863663832099e-05, "loss": 0.0027, "num_tokens": 32456210.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7188710570005534, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.03255838970653713, "learning_rate": 1.617610460960661e-05, "loss": 0.0013, "num_tokens": 32463320.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7190555248109205, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.0578594203107059, "learning_rate": 1.6173571940594775e-05, "loss": 0.0023, "num_tokens": 32474048.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7192399926212876, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.035128909978084266, "learning_rate": 1.6171038631548056e-05, "loss": 0.0014, "num_tokens": 32481491.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 468.0, "completions/mean_terminated_length": 468.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.7194244604316546, "frac_reward_zero_std": 1.0, "grad_norm": 0.038330078125, "kl": 0.018934093764983118, "learning_rate": 1.6168504682729095e-05, "loss": 0.0008, "num_tokens": 32492251.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7196089282420217, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.04249160597100854, "learning_rate": 1.6165970094400584e-05, "loss": 0.0017, "num_tokens": 32501462.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 742.5, "completions/mean_terminated_length": 742.5, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.7197933960523889, "frac_reward_zero_std": 0.0, "grad_norm": 0.625, "kl": 0.02753139554988593, "learning_rate": 1.61634348668253e-05, "loss": 0.0011, "num_tokens": 32516578.0, "reward": 1.78125, "reward_std": 0.33905068039894104, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.33905068039894104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 555.75, "completions/mean_terminated_length": 555.75, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.719977863862756, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.02451376523822546, "learning_rate": 1.616089900026608e-05, "loss": 0.001, "num_tokens": 32527312.0, "reward": 1.9196429252624512, "reward_std": 0.2272842973470688, "rewards/fixed_code_pass_all_test_reward/mean": 0.9196428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.22728432714939117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.720162331673123, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.03667365899309516, "learning_rate": 1.6158362494985817e-05, "loss": 0.0015, "num_tokens": 32532265.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 332.0, "completions/mean_terminated_length": 332.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7203467994834901, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0389222779776901, "learning_rate": 1.615582535124749e-05, "loss": 0.0016, "num_tokens": 32540553.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7205312672938572, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.050610557897016406, "learning_rate": 1.615328756931412e-05, "loss": 0.002, "num_tokens": 32551256.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 408.0, "completions/mean_terminated_length": 408.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7207157351042243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0291748046875, "kl": 0.021787960431538522, "learning_rate": 1.615074914944882e-05, "loss": 0.0009, "num_tokens": 32561872.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 476.125, "completions/mean_terminated_length": 476.125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.7209002029145914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0308837890625, "kl": 0.025926578673534095, "learning_rate": 1.6148210091914753e-05, "loss": 0.001, "num_tokens": 32571297.0, "reward": 1.1666666269302368, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7210846707249585, "frac_reward_zero_std": 1.0, "grad_norm": 0.166015625, "kl": 0.038321727653965354, "learning_rate": 1.614567039697515e-05, "loss": 0.0015, "num_tokens": 32581879.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.7212691385353256, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.06289628148078918, "learning_rate": 1.6143130064893305e-05, "loss": 0.0025, "num_tokens": 32588347.0, "reward": 1.5208333730697632, "reward_std": 0.4124789535999298, "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4124789535999298, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7214536063456927, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.041084932861849666, "learning_rate": 1.614058909593259e-05, "loss": 0.0016, "num_tokens": 32600379.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 105.0, "completions/mean_terminated_length": 105.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.7216380741560597, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.04542797524482012, "learning_rate": 1.6138047490356432e-05, "loss": 0.0018, "num_tokens": 32603915.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7218225419664268, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.030650387285277247, "learning_rate": 1.6135505248428328e-05, "loss": 0.0012, "num_tokens": 32613829.0, "reward": 1.9821429252624512, "reward_std": 0.05050760135054588, "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 738.125, "completions/mean_terminated_length": 738.125, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.722007009776794, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.026643901015631855, "learning_rate": 1.6132962370411847e-05, "loss": 0.0011, "num_tokens": 32628958.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7221914775871611, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.05347546795383096, "learning_rate": 1.6130418856570606e-05, "loss": 0.0021, "num_tokens": 32637346.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7223759453975281, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.032074823742732406, "learning_rate": 1.612787470716831e-05, "loss": 0.0013, "num_tokens": 32646412.0, "reward": 1.915816307067871, "reward_std": 0.23810741305351257, "rewards/fixed_code_pass_all_test_reward/mean": 0.9158163070678711, "rewards/fixed_code_pass_all_test_reward/std": 0.23810741305351257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7225604132078952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.056091839680448174, "learning_rate": 1.6125329922468714e-05, "loss": 0.0022, "num_tokens": 32653431.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 528.625, "completions/mean_terminated_length": 528.625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.7227448810182623, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.03367272554896772, "learning_rate": 1.6122784502735647e-05, "loss": 0.0013, "num_tokens": 32663492.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 217.57144165039062, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7229293488286294, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.046350834832992405, "learning_rate": 1.6120238448232996e-05, "loss": 0.0019, "num_tokens": 32669847.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7231138166389965, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.08563888259232044, "learning_rate": 1.6117691759224726e-05, "loss": 0.0034, "num_tokens": 32678531.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7232982844493636, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.04110580240376294, "learning_rate": 1.611514443597486e-05, "loss": 0.0016, "num_tokens": 32685202.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7234827522597307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.04443772812373936, "learning_rate": 1.6112596478747482e-05, "loss": 0.0018, "num_tokens": 32691563.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7236672200700978, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.03798094438388944, "learning_rate": 1.611004788780675e-05, "loss": 0.0015, "num_tokens": 32702947.0, "reward": 1.8888888359069824, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 622.25, "completions/mean_terminated_length": 622.25, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.7238516878804648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0311279296875, "kl": 0.02323835517745465, "learning_rate": 1.6107498663416888e-05, "loss": 0.0009, "num_tokens": 32717909.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7240361556908319, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.043910090113058686, "learning_rate": 1.610494880584218e-05, "loss": 0.0018, "num_tokens": 32727604.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7242206235011991, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.042537749744951725, "learning_rate": 1.6102398315346976e-05, "loss": 0.0017, "num_tokens": 32732269.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7244050913115662, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.057756332447752357, "learning_rate": 1.6099847192195696e-05, "loss": 0.0023, "num_tokens": 32738908.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 414.75, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.7245895591219332, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.05562214110977948, "learning_rate": 1.6097295436652825e-05, "loss": 0.0022, "num_tokens": 32748842.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 505.25, "completions/mean_terminated_length": 505.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7247740269323003, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.014650968718342483, "learning_rate": 1.6094743048982914e-05, "loss": 0.0006, "num_tokens": 32758188.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7249584947426674, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.044432720518670976, "learning_rate": 1.609219002945057e-05, "loss": 0.0018, "num_tokens": 32768159.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 391.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7251429625530345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.058671939419582486, "learning_rate": 1.608963637832048e-05, "loss": 0.0023, "num_tokens": 32778881.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7253274303634016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.044165000319480896, "learning_rate": 1.608708209585739e-05, "loss": 0.0018, "num_tokens": 32787349.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7255118981737687, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.066458644811064, "learning_rate": 1.60845271823261e-05, "loss": 0.0027, "num_tokens": 32791589.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7256963659841358, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.05199770093895495, "learning_rate": 1.60819716379915e-05, "loss": 0.0021, "num_tokens": 32798135.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7258808337945029, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.04387250286526978, "learning_rate": 1.6079415463118525e-05, "loss": 0.0018, "num_tokens": 32802428.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7260653016048699, "frac_reward_zero_std": 1.0, "grad_norm": 0.376953125, "kl": 0.04628057649824768, "learning_rate": 1.607685865797218e-05, "loss": 0.0019, "num_tokens": 32806917.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.726249769415237, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.027005181298591197, "learning_rate": 1.607430122281755e-05, "loss": 0.0011, "num_tokens": 32814898.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7264342372256042, "frac_reward_zero_std": 1.0, "grad_norm": 1.65625, "kl": 0.08047004439868033, "learning_rate": 1.6071743157919757e-05, "loss": 0.0032, "num_tokens": 32824844.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7266187050359713, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.056424317648634315, "learning_rate": 1.6069184463544013e-05, "loss": 0.0023, "num_tokens": 32835305.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.7268031728463383, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.03463415149599314, "learning_rate": 1.6066625139955584e-05, "loss": 0.0014, "num_tokens": 32845406.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7269876406567054, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.041150896809995174, "learning_rate": 1.6064065187419807e-05, "loss": 0.0016, "num_tokens": 32854311.0, "reward": 1.2666666507720947, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2666666805744171, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7271721084670725, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04893456865102053, "learning_rate": 1.6061504606202073e-05, "loss": 0.002, "num_tokens": 32862011.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7273565762774395, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517578125, "kl": 0.026307484367862344, "learning_rate": 1.6058943396567857e-05, "loss": 0.0011, "num_tokens": 32868080.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7275410440878067, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.02838913816958666, "learning_rate": 1.605638155878268e-05, "loss": 0.0011, "num_tokens": 32872239.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7277255118981738, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.08450439060106874, "learning_rate": 1.605381909311214e-05, "loss": 0.0034, "num_tokens": 32876867.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 392.0, "completions/mean_terminated_length": 392.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.7279099797085409, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.029690427938476205, "learning_rate": 1.6051255999821892e-05, "loss": 0.0012, "num_tokens": 32886107.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.728094447518908, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.03432905371300876, "learning_rate": 1.6048692279177664e-05, "loss": 0.0014, "num_tokens": 32897332.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.728278915329275, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.02945005102083087, "learning_rate": 1.6046127931445245e-05, "loss": 0.0012, "num_tokens": 32902932.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7284633831396421, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.04274723259732127, "learning_rate": 1.604356295689049e-05, "loss": 0.0017, "num_tokens": 32907480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7286478509500092, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04971313290297985, "learning_rate": 1.6040997355779316e-05, "loss": 0.002, "num_tokens": 32917500.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 556.5, "completions/mean_terminated_length": 556.5, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.7288323187603764, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.028119852184318006, "learning_rate": 1.6038431128377713e-05, "loss": 0.0011, "num_tokens": 32931240.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7290167865707434, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.02718433376867324, "learning_rate": 1.6035864274951728e-05, "loss": 0.0011, "num_tokens": 32938223.0, "reward": 1.7058823108673096, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7292012543811105, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.06950789829716086, "learning_rate": 1.603329679576747e-05, "loss": 0.0028, "num_tokens": 32946657.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7293857221914776, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03142592078074813, "learning_rate": 1.6030728691091124e-05, "loss": 0.0013, "num_tokens": 32951876.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7295701900018446, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.027054836857132614, "learning_rate": 1.6028159961188934e-05, "loss": 0.0011, "num_tokens": 32958155.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 546.75, "completions/mean_terminated_length": 546.75, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.7297546578122117, "frac_reward_zero_std": 0.0, "grad_norm": 0.6953125, "kl": 0.025944420252926648, "learning_rate": 1.6025590606327208e-05, "loss": 0.001, "num_tokens": 32968433.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 506.5, "completions/mean_terminated_length": 506.5, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.7299391256225789, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.028234106488525867, "learning_rate": 1.6023020626772313e-05, "loss": 0.0011, "num_tokens": 32981045.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 379.25, "completions/mean_terminated_length": 379.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.730123593432946, "frac_reward_zero_std": 1.0, "grad_norm": 8.1875, "kl": 0.245445808628574, "learning_rate": 1.6020450022790695e-05, "loss": 0.0098, "num_tokens": 32993719.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.730308061243313, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.053407154977321625, "learning_rate": 1.6017878794648856e-05, "loss": 0.0021, "num_tokens": 33001502.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7304925290536801, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.0243281185394153, "learning_rate": 1.6015306942613363e-05, "loss": 0.001, "num_tokens": 33005396.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7306769968640472, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.09135144017636776, "learning_rate": 1.6012734466950852e-05, "loss": 0.0037, "num_tokens": 33009673.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 3961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 363.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7308614646744143, "frac_reward_zero_std": 1.0, "grad_norm": 0.054443359375, "kl": 0.03955088322982192, "learning_rate": 1.6010161367928017e-05, "loss": 0.0016, "num_tokens": 33016646.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.7310459324847814, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.04728124290704727, "learning_rate": 1.6007587645811614e-05, "loss": 0.0019, "num_tokens": 33020725.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 402.75, "completions/mean_terminated_length": 402.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7312304002951485, "frac_reward_zero_std": 1.0, "grad_norm": 0.2001953125, "kl": 0.02095779002411291, "learning_rate": 1.600501330086848e-05, "loss": 0.0008, "num_tokens": 33028123.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7314148681055156, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.09113500593230128, "learning_rate": 1.60024383333655e-05, "loss": 0.0036, "num_tokens": 33036711.0, "reward": 1.0714285373687744, "reward_std": 0.07636039704084396, "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, "rewards/fixed_code_pass_all_test_reward/std": 0.07636035978794098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7315993359158827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.05243783490732312, "learning_rate": 1.5999862743569626e-05, "loss": 0.0021, "num_tokens": 33045330.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7317838037262497, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.045014011673629284, "learning_rate": 1.599728653174789e-05, "loss": 0.0018, "num_tokens": 33049503.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7319682715366168, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0528943941462785, "learning_rate": 1.5994709698167363e-05, "loss": 0.0021, "num_tokens": 33058313.0, "reward": 1.514423131942749, "reward_std": 0.5204757452011108, "rewards/fixed_code_pass_all_test_reward/mean": 0.5144230723381042, "rewards/fixed_code_pass_all_test_reward/std": 0.5204757452011108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.732152739346984, "frac_reward_zero_std": 1.0, "grad_norm": 0.05078125, "kl": 0.040799153968691826, "learning_rate": 1.5992132243095203e-05, "loss": 0.0016, "num_tokens": 33067803.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7323372071573511, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0640311362221837, "learning_rate": 1.598955416679862e-05, "loss": 0.0026, "num_tokens": 33075818.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 493.25, "completions/mean_terminated_length": 493.25, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.7325216749677181, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03732651798054576, "learning_rate": 1.598697546954489e-05, "loss": 0.0015, "num_tokens": 33090156.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.7327061427780852, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.027838158421218395, "learning_rate": 1.598439615160136e-05, "loss": 0.0011, "num_tokens": 33103361.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 627.75, "completions/mean_terminated_length": 627.75, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.7328906105884523, "frac_reward_zero_std": 0.0, "grad_norm": 0.6484375, "kl": 0.03472438082098961, "learning_rate": 1.5981816213235432e-05, "loss": 0.0014, "num_tokens": 33123719.0, "reward": 1.5201612710952759, "reward_std": 0.3997313976287842, "rewards/fixed_code_pass_all_test_reward/mean": 0.5201612710952759, "rewards/fixed_code_pass_all_test_reward/std": 0.39973142743110657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7330750783988194, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.01801547675859183, "learning_rate": 1.597923565471458e-05, "loss": 0.0007, "num_tokens": 33127512.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 509.625, "completions/mean_terminated_length": 509.625, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.7332595462091865, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.028727317927405238, "learning_rate": 1.5976654476306338e-05, "loss": 0.0011, "num_tokens": 33137493.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7334440140195536, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.04967854171991348, "learning_rate": 1.5974072678278306e-05, "loss": 0.002, "num_tokens": 33144358.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 551.5, "completions/mean_terminated_length": 551.5, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7336284818299207, "frac_reward_zero_std": 0.0, "grad_norm": 0.71484375, "kl": 0.040022993460297585, "learning_rate": 1.5971490260898142e-05, "loss": 0.0016, "num_tokens": 33158002.0, "reward": 1.4953703880310059, "reward_std": 0.6347249150276184, "rewards/fixed_code_pass_all_test_reward/mean": 0.6203703880310059, "rewards/fixed_code_pass_all_test_reward/std": 0.31721773743629456, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7338129496402878, "frac_reward_zero_std": 1.0, "grad_norm": 0.388671875, "kl": 0.06516158045269549, "learning_rate": 1.5968907224433585e-05, "loss": 0.0026, "num_tokens": 33162151.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7339974174506548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0615234375, "kl": 0.04679653234779835, "learning_rate": 1.596632356915242e-05, "loss": 0.0019, "num_tokens": 33168908.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 402.5, "completions/mean_terminated_length": 402.5, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.7341818852610219, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.035484156222082675, "learning_rate": 1.5963739295322504e-05, "loss": 0.0014, "num_tokens": 33180096.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7343663530713891, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.054780554957687855, "learning_rate": 1.5961154403211755e-05, "loss": 0.0022, "num_tokens": 33188270.0, "reward": 1.892045497894287, "reward_std": 0.30534157156944275, "rewards/fixed_code_pass_all_test_reward/mean": 0.8920454382896423, "rewards/fixed_code_pass_all_test_reward/std": 0.30534157156944275, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7345508208817562, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06549424352124333, "learning_rate": 1.595856889308816e-05, "loss": 0.0026, "num_tokens": 33194864.0, "reward": 1.9431818723678589, "reward_std": 0.047049880027770996, "rewards/fixed_code_pass_all_test_reward/mean": 0.9431818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.047049909830093384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7347352886921232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.07016842905431986, "learning_rate": 1.5955982765219768e-05, "loss": 0.0028, "num_tokens": 33198566.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.7349197565024903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.027329481905326247, "learning_rate": 1.595339601987469e-05, "loss": 0.0011, "num_tokens": 33208670.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7351042243128574, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.048960990039631724, "learning_rate": 1.595080865732111e-05, "loss": 0.002, "num_tokens": 33217337.0, "reward": 1.8799999952316284, "reward_std": 0.24657657742500305, "rewards/fixed_code_pass_all_test_reward/mean": 0.8799999952316284, "rewards/fixed_code_pass_all_test_reward/std": 0.24657656252384186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7352886921232245, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.053261857479810715, "learning_rate": 1.5948220677827253e-05, "loss": 0.0021, "num_tokens": 33223218.0, "reward": 1.9736841917037964, "reward_std": 0.048727408051490784, "rewards/fixed_code_pass_all_test_reward/mean": 0.9736841917037964, "rewards/fixed_code_pass_all_test_reward/std": 0.048727381974458694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7354731599335916, "frac_reward_zero_std": 1.0, "grad_norm": 0.5859375, "kl": 0.09905620105564594, "learning_rate": 1.5945632081661436e-05, "loss": 0.004, "num_tokens": 33227330.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7356576277439587, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.028455681866034865, "learning_rate": 1.5943042869092024e-05, "loss": 0.0011, "num_tokens": 33234024.0, "reward": 1.6881721019744873, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7358420955543258, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.015687916544266045, "learning_rate": 1.5940453040387448e-05, "loss": 0.0006, "num_tokens": 33239623.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7360265633646929, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.027939641615375876, "learning_rate": 1.593786259581621e-05, "loss": 0.0011, "num_tokens": 33243829.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 441.25, "completions/mean_terminated_length": 441.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7362110311750599, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.028994532534852624, "learning_rate": 1.5935271535646858e-05, "loss": 0.0012, "num_tokens": 33250391.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.736395498985427, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.041998119675554335, "learning_rate": 1.593267986014803e-05, "loss": 0.0017, "num_tokens": 33255400.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7365799667957942, "frac_reward_zero_std": 1.0, "grad_norm": 0.047607421875, "kl": 0.025974841555580497, "learning_rate": 1.5930087569588403e-05, "loss": 0.001, "num_tokens": 33265276.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7367644346061613, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.13659921940416098, "learning_rate": 1.5927494664236735e-05, "loss": 0.0055, "num_tokens": 33274598.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 3994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 435.375, "completions/mean_terminated_length": 435.375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7369489024165283, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.050163556821644306, "learning_rate": 1.592490114436184e-05, "loss": 0.002, "num_tokens": 33287065.0, "reward": 1.9147727489471436, "reward_std": 0.24105912446975708, "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.24105913937091827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 287.0, "completions/mean_terminated_length": 287.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.7371333702268954, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.0366561864502728, "learning_rate": 1.5922307010232593e-05, "loss": 0.0015, "num_tokens": 33293201.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 432.625, "completions/mean_terminated_length": 432.625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7373178380372625, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.018781858030706644, "learning_rate": 1.591971226211794e-05, "loss": 0.0008, "num_tokens": 33301302.0, "reward": 1.9318182468414307, "reward_std": 0.042082689702510834, "rewards/fixed_code_pass_all_test_reward/mean": 0.9318182468414307, "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 561.125, "completions/mean_terminated_length": 561.125, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.7375023058476295, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.04009233810938895, "learning_rate": 1.5917116900286885e-05, "loss": 0.0016, "num_tokens": 33311311.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 499.125, "completions/mean_terminated_length": 499.125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.7376867736579967, "frac_reward_zero_std": 1.0, "grad_norm": 0.047119140625, "kl": 0.03048451093491167, "learning_rate": 1.59145209250085e-05, "loss": 0.0012, "num_tokens": 33325704.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 3999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7378712414683638, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04478382761590183, "learning_rate": 1.5911924336551918e-05, "loss": 0.0018, "num_tokens": 33333549.0, "reward": 1.795454502105713, "reward_std": 0.37874454259872437, "rewards/fixed_code_pass_all_test_reward/mean": 0.7954545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.37874457240104675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.7380557092787309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.03336609178222716, "learning_rate": 1.5909327135186337e-05, "loss": 0.0013, "num_tokens": 33338958.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.738240177089098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.0220868110191077, "learning_rate": 1.5906729321181017e-05, "loss": 0.0009, "num_tokens": 33343702.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.738424644899465, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.11390826385468245, "learning_rate": 1.5904130894805278e-05, "loss": 0.0046, "num_tokens": 33347754.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7386091127098321, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "kl": 0.017839289736002684, "learning_rate": 1.5901531856328512e-05, "loss": 0.0007, "num_tokens": 33359747.0, "reward": 1.841346263885498, "reward_std": 0.21896308660507202, "rewards/fixed_code_pass_all_test_reward/mean": 0.8413461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.2189631313085556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.7387935805201993, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.054490976966917515, "learning_rate": 1.5898932206020173e-05, "loss": 0.0022, "num_tokens": 33366466.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 415.75, "completions/mean_terminated_length": 415.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7389780483305664, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.06692877889145166, "learning_rate": 1.5896331944149768e-05, "loss": 0.0027, "num_tokens": 33376192.0, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7391625161409334, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.02966914139688015, "learning_rate": 1.5893731070986875e-05, "loss": 0.0012, "num_tokens": 33382005.0, "reward": 1.5384615659713745, "reward_std": 0.4934053421020508, "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.49340540170669556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 743.625, "completions/mean_terminated_length": 743.625, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.7393469839513005, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.03560088959056884, "learning_rate": 1.5891129586801143e-05, "loss": 0.0014, "num_tokens": 33396050.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7395314517616676, "frac_reward_zero_std": 1.0, "grad_norm": 0.03857421875, "kl": 0.0306621128693223, "learning_rate": 1.588852749186227e-05, "loss": 0.0012, "num_tokens": 33401785.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7397159195720346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.04600727395154536, "learning_rate": 1.588592478644003e-05, "loss": 0.0018, "num_tokens": 33408785.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 499.875, "completions/mean_terminated_length": 499.875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.7399003873824018, "frac_reward_zero_std": 0.0, "grad_norm": 0.77734375, "kl": 0.031372192897833884, "learning_rate": 1.5883321470804248e-05, "loss": 0.0013, "num_tokens": 33420552.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 304.75, "completions/mean_terminated_length": 304.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7400848551927689, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0462660975754261, "learning_rate": 1.5880717545224817e-05, "loss": 0.0019, "num_tokens": 33428598.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.740269323003136, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.051337195094674826, "learning_rate": 1.5878113009971704e-05, "loss": 0.0021, "num_tokens": 33437051.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.740453790813503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.03495763521641493, "learning_rate": 1.587550786531492e-05, "loss": 0.0014, "num_tokens": 33446138.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 456.75, "completions/mean_terminated_length": 456.75, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7406382586238701, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.048813389614224434, "learning_rate": 1.5872902111524556e-05, "loss": 0.002, "num_tokens": 33456632.0, "reward": 1.7857142686843872, "reward_std": 0.3295460343360901, "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.3295460343360901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7408227264342372, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.0294568482786417, "learning_rate": 1.5870295748870756e-05, "loss": 0.0012, "num_tokens": 33461476.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7410071942446043, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.05165248294360936, "learning_rate": 1.5867688777623728e-05, "loss": 0.0021, "num_tokens": 33466741.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 362.125, "completions/mean_terminated_length": 362.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7411916620549714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.04054518206976354, "learning_rate": 1.586508119805375e-05, "loss": 0.0016, "num_tokens": 33473342.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7413761298653385, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.038770116632804275, "learning_rate": 1.586247301043116e-05, "loss": 0.0016, "num_tokens": 33481156.0, "reward": 1.4134615659713745, "reward_std": 0.27254632115364075, "rewards/fixed_code_pass_all_test_reward/mean": 0.4134615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.27254632115364075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7415605976757056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.03849170613102615, "learning_rate": 1.585986421502635e-05, "loss": 0.0015, "num_tokens": 33485068.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7417450654860727, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.036965793929994106, "learning_rate": 1.5857254812109788e-05, "loss": 0.0015, "num_tokens": 33495320.0, "reward": 1.9821429252624512, "reward_std": 0.05050760135054588, "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 390.5, "completions/mean_terminated_length": 390.5, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.7419295332964397, "frac_reward_zero_std": 1.0, "grad_norm": 0.042724609375, "kl": 0.03305220138281584, "learning_rate": 1.5854644801952003e-05, "loss": 0.0013, "num_tokens": 33505788.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 506.25, "completions/mean_terminated_length": 506.25, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.7421140011068068, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.02479924855288118, "learning_rate": 1.585203418482357e-05, "loss": 0.001, "num_tokens": 33515262.0, "reward": 1.8909574747085571, "reward_std": 0.044684480875730515, "rewards/fixed_code_pass_all_test_reward/mean": 0.8909574747085571, "rewards/fixed_code_pass_all_test_reward/std": 0.04468446597456932, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 408.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.742298468917174, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.046957595739513636, "learning_rate": 1.5849422960995157e-05, "loss": 0.0019, "num_tokens": 33523068.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.7424829367275411, "frac_reward_zero_std": 1.0, "grad_norm": 1.2578125, "kl": 0.168346063233912, "learning_rate": 1.584681113073747e-05, "loss": 0.0067, "num_tokens": 33526663.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7426674045379081, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.07877808157354593, "learning_rate": 1.5844198694321283e-05, "loss": 0.0032, "num_tokens": 33530983.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 478.0, "completions/mean_terminated_length": 478.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7428518723482752, "frac_reward_zero_std": 0.0, "grad_norm": 0.56640625, "kl": 0.023095287615433335, "learning_rate": 1.5841585652017445e-05, "loss": 0.0009, "num_tokens": 33542303.0, "reward": 1.9525315761566162, "reward_std": 0.13426080346107483, "rewards/fixed_code_pass_all_test_reward/mean": 0.952531635761261, "rewards/fixed_code_pass_all_test_reward/std": 0.13426078855991364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7430363401586423, "frac_reward_zero_std": 1.0, "grad_norm": 0.04638671875, "kl": 0.026699120178818703, "learning_rate": 1.583897200409685e-05, "loss": 0.0011, "num_tokens": 33553039.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7432208079690094, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.025468763895332813, "learning_rate": 1.5836357750830467e-05, "loss": 0.001, "num_tokens": 33562429.0, "reward": 1.7472221851348877, "reward_std": 0.10213766247034073, "rewards/fixed_code_pass_all_test_reward/mean": 0.7472221851348877, "rewards/fixed_code_pass_all_test_reward/std": 0.10213764011859894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7434052757793765, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.03593021538108587, "learning_rate": 1.5833742892489328e-05, "loss": 0.0014, "num_tokens": 33572309.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7435897435897436, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.03660622111056, "learning_rate": 1.5831127429344516e-05, "loss": 0.0015, "num_tokens": 33580282.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.7437742114001107, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.02688920369837433, "learning_rate": 1.5828511361667194e-05, "loss": 0.0011, "num_tokens": 33584409.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.7439586792104778, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.06510383123531938, "learning_rate": 1.5825894689728575e-05, "loss": 0.0026, "num_tokens": 33588129.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7441431470208448, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.06580345064867288, "learning_rate": 1.5823277413799937e-05, "loss": 0.0026, "num_tokens": 33594390.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7443276148312119, "frac_reward_zero_std": 1.0, "grad_norm": 0.3125, "kl": 0.058121299371123314, "learning_rate": 1.582065953415262e-05, "loss": 0.0023, "num_tokens": 33598492.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7445120826415791, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.05001427954994142, "learning_rate": 1.5818041051058034e-05, "loss": 0.002, "num_tokens": 33602343.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7446965504519462, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.032082814374007285, "learning_rate": 1.581542196478764e-05, "loss": 0.0013, "num_tokens": 33608777.0, "reward": 1.3499999046325684, "reward_std": 0.1414213478565216, "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 384.0, "completions/mean_terminated_length": 384.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7448810182623132, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.06394024123437703, "learning_rate": 1.5812802275612972e-05, "loss": 0.0026, "num_tokens": 33616113.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 430.25, "completions/mean_terminated_length": 430.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.7450654860726803, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.05865313624963164, "learning_rate": 1.581018198380562e-05, "loss": 0.0023, "num_tokens": 33632035.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 456.75, "completions/mean_terminated_length": 456.75, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7452499538830474, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.019989887019619346, "learning_rate": 1.5807561089637232e-05, "loss": 0.0008, "num_tokens": 33639705.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7454344216934145, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.027710677939467132, "learning_rate": 1.5804939593379534e-05, "loss": 0.0011, "num_tokens": 33646186.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7456188895037816, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.06420320738106966, "learning_rate": 1.5802317495304304e-05, "loss": 0.0026, "num_tokens": 33656594.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7458033573141487, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.05376526340842247, "learning_rate": 1.579969479568338e-05, "loss": 0.0022, "num_tokens": 33660877.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 374.0, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7459878251245158, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.05379949533380568, "learning_rate": 1.579707149478867e-05, "loss": 0.0022, "num_tokens": 33671309.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7461722929348829, "frac_reward_zero_std": 0.0, "grad_norm": 0.91796875, "kl": 0.08552798861637712, "learning_rate": 1.579444759289214e-05, "loss": 0.0034, "num_tokens": 33679888.0, "reward": 1.6953125, "reward_std": 0.3696606457233429, "rewards/fixed_code_pass_all_test_reward/mean": 0.6953125, "rewards/fixed_code_pass_all_test_reward/std": 0.3696606457233429, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7463567607452499, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.03387106506852433, "learning_rate": 1.5791823090265817e-05, "loss": 0.0014, "num_tokens": 33684883.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.746541228555617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.028350495267659426, "learning_rate": 1.5789197987181792e-05, "loss": 0.0011, "num_tokens": 33688867.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7467256963659842, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.028165154508315027, "learning_rate": 1.578657228391222e-05, "loss": 0.0011, "num_tokens": 33697628.0, "reward": 1.65234375, "reward_std": 0.29725655913352966, "rewards/fixed_code_pass_all_test_reward/mean": 0.77734375, "rewards/fixed_code_pass_all_test_reward/std": 0.06823570281267166, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 458.25, "completions/mean_terminated_length": 458.25, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.7469101641763513, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.05670161754824221, "learning_rate": 1.578394598072931e-05, "loss": 0.0023, "num_tokens": 33708694.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7470946319867183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.04341883957386017, "learning_rate": 1.5781319077905347e-05, "loss": 0.0017, "num_tokens": 33716899.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 483.375, "completions/mean_terminated_length": 483.375, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.7472790997970854, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.033641395973972976, "learning_rate": 1.577869157571267e-05, "loss": 0.0013, "num_tokens": 33728206.0, "reward": 1.1875, "reward_std": 0.3282996118068695, "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, "rewards/fixed_code_pass_all_test_reward/std": 0.3282995820045471, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7474635676074525, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.05429615546017885, "learning_rate": 1.5776063474423677e-05, "loss": 0.0022, "num_tokens": 33736679.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7476480354178195, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0604194737970829, "learning_rate": 1.5773434774310835e-05, "loss": 0.0024, "num_tokens": 33740919.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 487.75, "completions/mean_terminated_length": 487.75, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7478325032281867, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.03337793389800936, "learning_rate": 1.577080547564667e-05, "loss": 0.0013, "num_tokens": 33749141.0, "reward": 1.529761791229248, "reward_std": 0.13810105621814728, "rewards/fixed_code_pass_all_test_reward/mean": 0.5297619104385376, "rewards/fixed_code_pass_all_test_reward/std": 0.13810110092163086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7480169710385538, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.06465550232678652, "learning_rate": 1.5768175578703773e-05, "loss": 0.0026, "num_tokens": 33754750.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 377.625, "completions/mean_terminated_length": 377.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7482014388489209, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.051934156101197004, "learning_rate": 1.5765545083754784e-05, "loss": 0.0021, "num_tokens": 33766027.0, "reward": 1.0925925970077515, "reward_std": 0.058560676872730255, "rewards/fixed_code_pass_all_test_reward/mean": 0.09259258955717087, "rewards/fixed_code_pass_all_test_reward/std": 0.058560699224472046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.748385906659288, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.04148395196534693, "learning_rate": 1.576291399107243e-05, "loss": 0.0017, "num_tokens": 33775114.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.748570374469655, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306640625, "kl": 0.01985733606852591, "learning_rate": 1.5760282300929474e-05, "loss": 0.0008, "num_tokens": 33781324.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7487548422800221, "frac_reward_zero_std": 0.0, "grad_norm": 6.84375, "kl": 0.04690723423846066, "learning_rate": 1.5757650013598755e-05, "loss": 0.0019, "num_tokens": 33786455.0, "reward": 1.337499976158142, "reward_std": 0.5316752195358276, "rewards/fixed_code_pass_all_test_reward/mean": 0.4625000059604645, "rewards/fixed_code_pass_all_test_reward/std": 0.2875388264656067, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 566.0, "completions/mean_terminated_length": 566.0, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.7489393100903893, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.02554392023012042, "learning_rate": 1.575501712935317e-05, "loss": 0.001, "num_tokens": 33796311.0, "reward": 1.8208333253860474, "reward_std": 0.09417565912008286, "rewards/fixed_code_pass_all_test_reward/mean": 0.8208333253860474, "rewards/fixed_code_pass_all_test_reward/std": 0.09417562186717987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 426.625, "completions/mean_terminated_length": 426.625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7491237779007563, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.03370983060449362, "learning_rate": 1.5752383648465682e-05, "loss": 0.0013, "num_tokens": 33804868.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 555.25, "completions/mean_terminated_length": 555.25, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.7493082457111234, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.027929088100790977, "learning_rate": 1.5749749571209313e-05, "loss": 0.0011, "num_tokens": 33814678.0, "reward": 1.6688311100006104, "reward_std": 0.19902664422988892, "rewards/fixed_code_pass_all_test_reward/mean": 0.6688311696052551, "rewards/fixed_code_pass_all_test_reward/std": 0.19902662932872772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7494927135214905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.032012793235480785, "learning_rate": 1.5747114897857145e-05, "loss": 0.0013, "num_tokens": 33824770.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 576.125, "completions/mean_terminated_length": 576.125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.7496771813318576, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.031022879295051098, "learning_rate": 1.5744479628682325e-05, "loss": 0.0012, "num_tokens": 33837291.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7498616491422246, "frac_reward_zero_std": 1.0, "grad_norm": 0.04541015625, "kl": 0.01937434310093522, "learning_rate": 1.5741843763958053e-05, "loss": 0.0008, "num_tokens": 33842021.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7500461169525918, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.027074200217612088, "learning_rate": 1.5739207303957606e-05, "loss": 0.0011, "num_tokens": 33848144.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 422.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7502305847629589, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.04087367851752788, "learning_rate": 1.5736570248954312e-05, "loss": 0.0016, "num_tokens": 33856221.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 344.875, "completions/mean_terminated_length": 344.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.750415052573326, "frac_reward_zero_std": 1.0, "grad_norm": 0.2578125, "kl": 0.06644891854375601, "learning_rate": 1.5733932599221566e-05, "loss": 0.0027, "num_tokens": 33867116.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.750599520383693, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.023051332333125174, "learning_rate": 1.5731294355032813e-05, "loss": 0.0009, "num_tokens": 33871462.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7507839881940601, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.05594278662465513, "learning_rate": 1.5728655516661584e-05, "loss": 0.0022, "num_tokens": 33880405.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 384.75, "completions/mean_terminated_length": 384.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7509684560044272, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.02845495706424117, "learning_rate": 1.5726016084381438e-05, "loss": 0.0011, "num_tokens": 33891835.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 823.5, "completions/mean_terminated_length": 823.5, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.7511529238147944, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.043188205221667886, "learning_rate": 1.572337605846603e-05, "loss": 0.0017, "num_tokens": 33910263.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7513373916251614, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.08610758441500366, "learning_rate": 1.572073543918905e-05, "loss": 0.0034, "num_tokens": 33920469.0, "reward": 1.221153974533081, "reward_std": 0.02719642035663128, "rewards/fixed_code_pass_all_test_reward/mean": 0.2211538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7515218594355285, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.07740178680978715, "learning_rate": 1.5718094226824264e-05, "loss": 0.0031, "num_tokens": 33930485.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7517063272458956, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.07915871846489608, "learning_rate": 1.5715452421645494e-05, "loss": 0.0032, "num_tokens": 33935968.0, "reward": 1.9500000476837158, "reward_std": 0.09258202463388443, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7518907950562627, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.057063727639615536, "learning_rate": 1.5712810023926628e-05, "loss": 0.0023, "num_tokens": 33940202.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 385.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7520752628666297, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.03737764130346477, "learning_rate": 1.5710167033941607e-05, "loss": 0.0015, "num_tokens": 33947864.0, "reward": 1.8430231809616089, "reward_std": 0.12661686539649963, "rewards/fixed_code_pass_all_test_reward/mean": 0.8430233001708984, "rewards/fixed_code_pass_all_test_reward/std": 0.12661688029766083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 405.875, "completions/mean_terminated_length": 405.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7522597306769969, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.06536904047243297, "learning_rate": 1.5707523451964442e-05, "loss": 0.0026, "num_tokens": 33959583.0, "reward": 1.3641304969787598, "reward_std": 0.6139689683914185, "rewards/fixed_code_pass_all_test_reward/mean": 0.489130437374115, "rewards/fixed_code_pass_all_test_reward/std": 0.3349721133708954, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 557.875, "completions/mean_terminated_length": 557.875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.752444198487364, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.03108978015370667, "learning_rate": 1.5704879278269197e-05, "loss": 0.0012, "num_tokens": 33969558.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 462.5, "completions/mean_terminated_length": 462.5, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.7526286662977311, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.028140490758232772, "learning_rate": 1.570223451313001e-05, "loss": 0.0011, "num_tokens": 33982458.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7528131341080981, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.0488034060690552, "learning_rate": 1.5699589156821074e-05, "loss": 0.002, "num_tokens": 33989015.0, "reward": 1.5957446098327637, "reward_std": 0.23637956380844116, "rewards/fixed_code_pass_all_test_reward/mean": 0.5957446694374084, "rewards/fixed_code_pass_all_test_reward/std": 0.23637959361076355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7529976019184652, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.04167483979836106, "learning_rate": 1.5696943209616632e-05, "loss": 0.0017, "num_tokens": 34000532.0, "reward": 1.9950000047683716, "reward_std": 0.014142122119665146, "rewards/fixed_code_pass_all_test_reward/mean": 0.9950000047683716, "rewards/fixed_code_pass_all_test_reward/std": 0.014142143540084362, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 619.25, "completions/mean_terminated_length": 619.25, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.7531820697288323, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.03904522303491831, "learning_rate": 1.5694296671791004e-05, "loss": 0.0016, "num_tokens": 34012558.0, "reward": 1.89453125, "reward_std": 0.29831066727638245, "rewards/fixed_code_pass_all_test_reward/mean": 0.89453125, "rewards/fixed_code_pass_all_test_reward/std": 0.29831069707870483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7533665375391994, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.02607435453683138, "learning_rate": 1.5691649543618564e-05, "loss": 0.001, "num_tokens": 34018855.0, "reward": 1.8229167461395264, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 449.375, "completions/mean_terminated_length": 449.375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7535510053495665, "frac_reward_zero_std": 0.0, "grad_norm": 0.734375, "kl": 0.03199944703374058, "learning_rate": 1.568900182537375e-05, "loss": 0.0013, "num_tokens": 34031522.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7537354731599336, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0555715961381793, "learning_rate": 1.5686353517331062e-05, "loss": 0.0022, "num_tokens": 34042726.0, "reward": 1.65625, "reward_std": 0.47442004084587097, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.47442010045051575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7539199409703007, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.04379092110320926, "learning_rate": 1.568370461976505e-05, "loss": 0.0018, "num_tokens": 34053104.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7541044087806678, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.027351115364581347, "learning_rate": 1.5681055132950347e-05, "loss": 0.0011, "num_tokens": 34062485.0, "reward": 1.7272727489471436, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7542888765910348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0478515625, "kl": 0.03654542029835284, "learning_rate": 1.5678405057161625e-05, "loss": 0.0015, "num_tokens": 34076454.0, "reward": 1.0952380895614624, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.095238097012043, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7544733444014019, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05040164804086089, "learning_rate": 1.5675754392673626e-05, "loss": 0.002, "num_tokens": 34084083.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 530.5, "completions/mean_terminated_length": 530.5, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.7546578122117691, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.0252474487060681, "learning_rate": 1.567310313976116e-05, "loss": 0.001, "num_tokens": 34094151.0, "reward": 1.2083333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333283662796, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7548422800221362, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0407392093911767, "learning_rate": 1.5670451298699085e-05, "loss": 0.0016, "num_tokens": 34099904.0, "reward": 1.600961446762085, "reward_std": 0.40544557571411133, "rewards/fixed_code_pass_all_test_reward/mean": 0.6009615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.40544557571411133, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 448.25, "completions/mean_terminated_length": 448.25, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7550267478325032, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.032078812131658196, "learning_rate": 1.5667798869762328e-05, "loss": 0.0013, "num_tokens": 34111778.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 420.625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.7552112156428703, "frac_reward_zero_std": 1.0, "grad_norm": 0.031494140625, "kl": 0.021950013004243374, "learning_rate": 1.5665145853225876e-05, "loss": 0.0009, "num_tokens": 34121383.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 366.75, "completions/mean_terminated_length": 366.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7553956834532374, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.051481425296515226, "learning_rate": 1.5662492249364778e-05, "loss": 0.0021, "num_tokens": 34130477.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 521.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7555801512636044, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.02077665668912232, "learning_rate": 1.5659838058454136e-05, "loss": 0.0008, "num_tokens": 34142309.0, "reward": 1.633802890777588, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 525.875, "completions/mean_terminated_length": 525.875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.7557646190739716, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.03582437802106142, "learning_rate": 1.5657183280769128e-05, "loss": 0.0014, "num_tokens": 34155484.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7559490868843387, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.02201786037767306, "learning_rate": 1.5654527916584972e-05, "loss": 0.0009, "num_tokens": 34159244.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7561335546947058, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.03945993003435433, "learning_rate": 1.565187196617697e-05, "loss": 0.0016, "num_tokens": 34166394.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 429.375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.7563180225050729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0419921875, "kl": 0.03478241455741227, "learning_rate": 1.5649215429820466e-05, "loss": 0.0014, "num_tokens": 34174605.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7565024903154399, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.08544080378487706, "learning_rate": 1.5646558307790877e-05, "loss": 0.0034, "num_tokens": 34180298.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 545.5, "completions/mean_terminated_length": 545.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.756686958125807, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.04805665370076895, "learning_rate": 1.5643900600363666e-05, "loss": 0.0019, "num_tokens": 34189270.0, "reward": 1.4375, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 367.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7568714259361742, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.0419107002671808, "learning_rate": 1.5641242307814382e-05, "loss": 0.0017, "num_tokens": 34200802.0, "reward": 1.7727272510528564, "reward_std": 0.3401506841182709, "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.3401506841182709, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7570558937465413, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06570312357507646, "learning_rate": 1.5638583430418603e-05, "loss": 0.0026, "num_tokens": 34210828.0, "reward": 1.1964285373687744, "reward_std": 0.3813242018222809, "rewards/fixed_code_pass_all_test_reward/mean": 0.1964285671710968, "rewards/fixed_code_pass_all_test_reward/std": 0.38132426142692566, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7572403615569083, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.041695571737363935, "learning_rate": 1.5635923968451996e-05, "loss": 0.0017, "num_tokens": 34220394.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7574248293672754, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.04562750272452831, "learning_rate": 1.5633263922190266e-05, "loss": 0.0018, "num_tokens": 34229480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 397.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7576092971776425, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03588966163806617, "learning_rate": 1.56306032919092e-05, "loss": 0.0014, "num_tokens": 34240516.0, "reward": 1.6477272510528564, "reward_std": 0.32979726791381836, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, "rewards/fixed_code_pass_all_test_reward/std": 0.32979726791381836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7577937649880095, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.037955476436764, "learning_rate": 1.5627942077884627e-05, "loss": 0.0015, "num_tokens": 34247368.0, "reward": 1.1538461446762085, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461595773697, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.7579782327983767, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04663580749183893, "learning_rate": 1.562528028039245e-05, "loss": 0.0019, "num_tokens": 34255789.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7581627006087438, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.04126089462079108, "learning_rate": 1.5622617899708613e-05, "loss": 0.0017, "num_tokens": 34262914.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7583471684191109, "frac_reward_zero_std": 1.0, "grad_norm": 0.287109375, "kl": 0.05121637345291674, "learning_rate": 1.5619954936109148e-05, "loss": 0.002, "num_tokens": 34271169.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.758531636229478, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.041525262175127864, "learning_rate": 1.561729138987013e-05, "loss": 0.0017, "num_tokens": 34282634.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.758716104039845, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.05537239124532789, "learning_rate": 1.5614627261267697e-05, "loss": 0.0022, "num_tokens": 34291928.0, "reward": 1.8790322542190552, "reward_std": 0.34214845299720764, "rewards/fixed_code_pass_all_test_reward/mean": 0.8790322542190552, "rewards/fixed_code_pass_all_test_reward/std": 0.34214845299720764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7589005718502121, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.038832023506984115, "learning_rate": 1.5611962550578045e-05, "loss": 0.0016, "num_tokens": 34301137.0, "reward": 1.943750023841858, "reward_std": 0.1590990275144577, "rewards/fixed_code_pass_all_test_reward/mean": 0.9437500238418579, "rewards/fixed_code_pass_all_test_reward/std": 0.1590990275144577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7590850396605793, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.03992340853437781, "learning_rate": 1.5609297258077436e-05, "loss": 0.0016, "num_tokens": 34310791.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7592695074709463, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.038001936743967235, "learning_rate": 1.560663138404219e-05, "loss": 0.0015, "num_tokens": 34316603.0, "reward": 1.9605263471603394, "reward_std": 0.11164842545986176, "rewards/fixed_code_pass_all_test_reward/mean": 0.9605263471603394, "rewards/fixed_code_pass_all_test_reward/std": 0.11164844036102295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 325.25, "completions/mean_terminated_length": 325.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7594539752813134, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.02253741107415408, "learning_rate": 1.5603964928748685e-05, "loss": 0.0009, "num_tokens": 34322909.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 421.5, "completions/mean_terminated_length": 421.5, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7596384430916805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.018645445816218853, "learning_rate": 1.560129789247337e-05, "loss": 0.0007, "num_tokens": 34330889.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7598229109020476, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.06693549687042832, "learning_rate": 1.559863027549273e-05, "loss": 0.0027, "num_tokens": 34339862.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7600073787124146, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03041716292500496, "learning_rate": 1.559596207808334e-05, "loss": 0.0012, "num_tokens": 34348384.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.7601918465227818, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.04703114437870681, "learning_rate": 1.5593293300521814e-05, "loss": 0.0019, "num_tokens": 34361658.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7603763143331489, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03171178151387721, "learning_rate": 1.5590623943084836e-05, "loss": 0.0013, "num_tokens": 34368736.0, "reward": 1.6617646217346191, "reward_std": 0.20876337587833405, "rewards/fixed_code_pass_all_test_reward/mean": 0.6617647409439087, "rewards/fixed_code_pass_all_test_reward/std": 0.20876334607601166, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.760560782143516, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04495143215171993, "learning_rate": 1.5587954006049145e-05, "loss": 0.0018, "num_tokens": 34380285.0, "reward": 1.670454502105713, "reward_std": 0.052952997386455536, "rewards/fixed_code_pass_all_test_reward/mean": 0.6704545617103577, "rewards/fixed_code_pass_all_test_reward/std": 0.05295296385884285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 400.25, "completions/mean_terminated_length": 400.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.760745249953883, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.02235506847500801, "learning_rate": 1.5585283489691544e-05, "loss": 0.0009, "num_tokens": 34388119.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7609297177642501, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.04645890276879072, "learning_rate": 1.5582612394288892e-05, "loss": 0.0019, "num_tokens": 34398559.0, "reward": 1.9272727966308594, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7611141855746172, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.03720030828844756, "learning_rate": 1.557994072011811e-05, "loss": 0.0015, "num_tokens": 34410123.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7612986533849844, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.022905010846443474, "learning_rate": 1.5577268467456182e-05, "loss": 0.0009, "num_tokens": 34416998.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 392.875, "completions/mean_terminated_length": 392.875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7614831211953514, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.03299618302844465, "learning_rate": 1.5574595636580156e-05, "loss": 0.0013, "num_tokens": 34424901.0, "reward": 1.454545497894287, "reward_std": 0.09718587249517441, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.0971859022974968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 508.125, "completions/mean_terminated_length": 508.125, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.7616675890057185, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.04822696594055742, "learning_rate": 1.5571922227767118e-05, "loss": 0.0019, "num_tokens": 34434334.0, "reward": 1.0833332538604736, "reward_std": 0.0690065547823906, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.06900656223297119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7618520568160856, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.055022924207150936, "learning_rate": 1.556924824129424e-05, "loss": 0.0022, "num_tokens": 34439580.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 398.875, "completions/mean_terminated_length": 398.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.7620365246264527, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.058222068939357996, "learning_rate": 1.556657367743874e-05, "loss": 0.0023, "num_tokens": 34453099.0, "reward": 1.5431034564971924, "reward_std": 0.48844295740127563, "rewards/fixed_code_pass_all_test_reward/mean": 0.5431034564971924, "rewards/fixed_code_pass_all_test_reward/std": 0.48844295740127563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 422.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.7622209924368197, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.022955544875003397, "learning_rate": 1.5563898536477902e-05, "loss": 0.0009, "num_tokens": 34465096.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7624054602471869, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.04030996223445982, "learning_rate": 1.556122281868906e-05, "loss": 0.0016, "num_tokens": 34473361.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.762589928057554, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.03349179634824395, "learning_rate": 1.5558546524349623e-05, "loss": 0.0013, "num_tokens": 34481443.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7627743958679211, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.058142867404967546, "learning_rate": 1.5555869653737044e-05, "loss": 0.0023, "num_tokens": 34487594.0, "reward": 1.78125, "reward_std": 0.405046284198761, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.405046284198761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 675.875, "completions/mean_terminated_length": 675.875, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.7629588636782881, "frac_reward_zero_std": 0.0, "grad_norm": 0.640625, "kl": 0.022991442354395986, "learning_rate": 1.555319220712885e-05, "loss": 0.0009, "num_tokens": 34506161.0, "reward": 1.6145833730697632, "reward_std": 0.5097025036811829, "rewards/fixed_code_pass_all_test_reward/mean": 0.6145833134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.5097025036811829, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 783.125, "completions/mean_terminated_length": 783.125, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.7631433314886552, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109375, "kl": 0.04164518998004496, "learning_rate": 1.5550514184802614e-05, "loss": 0.0017, "num_tokens": 34522898.0, "reward": 1.4916666746139526, "reward_std": 0.7497618794441223, "rewards/fixed_code_pass_all_test_reward/mean": 0.6166666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.5108349919319153, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 389.5, "completions/mean_terminated_length": 389.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7633277992990223, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.05158032616600394, "learning_rate": 1.554783558703598e-05, "loss": 0.0021, "num_tokens": 34536174.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7635122671093895, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.0462767337448895, "learning_rate": 1.5545156414106647e-05, "loss": 0.0019, "num_tokens": 34544239.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7636967349197565, "frac_reward_zero_std": 0.0, "grad_norm": 0.83984375, "kl": 0.03616289282217622, "learning_rate": 1.554247666629237e-05, "loss": 0.0014, "num_tokens": 34553622.0, "reward": 1.6477272510528564, "reward_std": 0.22498849034309387, "rewards/fixed_code_pass_all_test_reward/mean": 0.6477273106575012, "rewards/fixed_code_pass_all_test_reward/std": 0.22498854994773865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7638812027301236, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.041621406679041684, "learning_rate": 1.553979634387097e-05, "loss": 0.0017, "num_tokens": 34562422.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7640656705404907, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.04784901230596006, "learning_rate": 1.5537115447120333e-05, "loss": 0.0019, "num_tokens": 34566783.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7642501383508578, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.04853527818340808, "learning_rate": 1.5534433976318383e-05, "loss": 0.0019, "num_tokens": 34573394.0, "reward": 1.796875, "reward_std": 0.3892385959625244, "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7644346061612248, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.037617075140587986, "learning_rate": 1.5531751931743122e-05, "loss": 0.0015, "num_tokens": 34583146.0, "reward": 1.5, "reward_std": 0.076360322535038, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.07636036723852158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.764619073971592, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03491357038728893, "learning_rate": 1.552906931367261e-05, "loss": 0.0014, "num_tokens": 34592347.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7648035417819591, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.02737523231189698, "learning_rate": 1.552638612238496e-05, "loss": 0.0011, "num_tokens": 34602815.0, "reward": 1.50632905960083, "reward_std": 0.41851532459259033, "rewards/fixed_code_pass_all_test_reward/mean": 0.6313291192054749, "rewards/fixed_code_pass_all_test_reward/std": 0.3943217396736145, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 419.125, "completions/mean_terminated_length": 419.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7649880095923262, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.03803575085476041, "learning_rate": 1.5523702358158348e-05, "loss": 0.0015, "num_tokens": 34610928.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7651724774026932, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.04329600534401834, "learning_rate": 1.552101802127101e-05, "loss": 0.0017, "num_tokens": 34615358.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7653569452130603, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.054386631469242275, "learning_rate": 1.5518333112001232e-05, "loss": 0.0022, "num_tokens": 34621119.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 767.75, "completions/mean_terminated_length": 767.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.7655414130234274, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.034190513426437974, "learning_rate": 1.5515647630627382e-05, "loss": 0.0014, "num_tokens": 34633893.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7657258808337944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.05819693300873041, "learning_rate": 1.5512961577427865e-05, "loss": 0.0023, "num_tokens": 34641790.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 338.625, "completions/mean_terminated_length": 338.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7659103486441616, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.024057555594481528, "learning_rate": 1.551027495268115e-05, "loss": 0.001, "num_tokens": 34652523.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7660948164545287, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.06564589589834213, "learning_rate": 1.5507587756665775e-05, "loss": 0.0026, "num_tokens": 34660207.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 342.875, "completions/mean_terminated_length": 342.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7662792842648958, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.040337187703698874, "learning_rate": 1.5504899989660324e-05, "loss": 0.0016, "num_tokens": 34667582.0, "reward": 1.6538461446762085, "reward_std": 0.483016699552536, "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.4830167293548584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7664637520752628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.052766723558306694, "learning_rate": 1.5502211651943454e-05, "loss": 0.0021, "num_tokens": 34676033.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7666482198856299, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04932376532815397, "learning_rate": 1.5499522743793868e-05, "loss": 0.002, "num_tokens": 34685117.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.766832687695997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.04520510137081146, "learning_rate": 1.5496833265490335e-05, "loss": 0.0018, "num_tokens": 34695044.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7670171555063642, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.0413076535332948, "learning_rate": 1.5494143217311685e-05, "loss": 0.0017, "num_tokens": 34705952.0, "reward": 1.9038461446762085, "reward_std": 0.2719641625881195, "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7672016233167313, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.05754320742562413, "learning_rate": 1.5491452599536806e-05, "loss": 0.0023, "num_tokens": 34715981.0, "reward": 1.5336538553237915, "reward_std": 0.5020825266838074, "rewards/fixed_code_pass_all_test_reward/mean": 0.5336538553237915, "rewards/fixed_code_pass_all_test_reward/std": 0.5020825266838074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 456.625, "completions/mean_terminated_length": 456.625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.7673860911270983, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.025910448050126433, "learning_rate": 1.5488761412444638e-05, "loss": 0.001, "num_tokens": 34723954.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.7675705589374654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.040328719245735556, "learning_rate": 1.5486069656314185e-05, "loss": 0.0016, "num_tokens": 34728341.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 426.0, "completions/mean_terminated_length": 426.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7677550267478325, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.03692259616218507, "learning_rate": 1.5483377331424516e-05, "loss": 0.0015, "num_tokens": 34737645.0, "reward": 1.2397959232330322, "reward_std": 0.025582974776625633, "rewards/fixed_code_pass_all_test_reward/mean": 0.23979592323303223, "rewards/fixed_code_pass_all_test_reward/std": 0.025582989677786827, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7679394945581995, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.03559138206765056, "learning_rate": 1.548068443805475e-05, "loss": 0.0014, "num_tokens": 34747778.0, "reward": 1.7699275016784668, "reward_std": 0.3413332402706146, "rewards/fixed_code_pass_all_test_reward/mean": 0.7699275016784668, "rewards/fixed_code_pass_all_test_reward/std": 0.341333270072937, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 81.0, "completions/mean_terminated_length": 81.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.7681239623685667, "frac_reward_zero_std": 0.0, "grad_norm": 4.84375, "kl": 0.0358170090476051, "learning_rate": 1.5477990976484067e-05, "loss": 0.0014, "num_tokens": 34751106.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7683084301789338, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.042712693102657795, "learning_rate": 1.5475296946991712e-05, "loss": 0.0017, "num_tokens": 34757085.0, "reward": 1.4310344457626343, "reward_std": 0.04876599833369255, "rewards/fixed_code_pass_all_test_reward/mean": 0.43103450536727905, "rewards/fixed_code_pass_all_test_reward/std": 0.04876597970724106, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7684928979893009, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.09636898338794708, "learning_rate": 1.5472602349856978e-05, "loss": 0.0039, "num_tokens": 34761382.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 287.125, "completions/mean_terminated_length": 287.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7686773657996679, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.028883957187645137, "learning_rate": 1.5469907185359227e-05, "loss": 0.0012, "num_tokens": 34767767.0, "reward": 1.9444444179534912, "reward_std": 0.15713484585285187, "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.768861833610035, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.03723579877987504, "learning_rate": 1.5467211453777874e-05, "loss": 0.0015, "num_tokens": 34777496.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 399.75, "completions/mean_terminated_length": 399.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7690463014204021, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.04571031464729458, "learning_rate": 1.5464515155392397e-05, "loss": 0.0018, "num_tokens": 34786038.0, "reward": 1.0071427822113037, "reward_std": 0.5345770120620728, "rewards/fixed_code_pass_all_test_reward/mean": 0.13214287161827087, "rewards/fixed_code_pass_all_test_reward/std": 0.3507384657859802, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.03686756337992847, "learning_rate": 1.5461818290482323e-05, "loss": 0.0015, "num_tokens": 34790578.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7694152370411363, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.04737623827531934, "learning_rate": 1.5459120859327255e-05, "loss": 0.0019, "num_tokens": 34800676.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7695997048515034, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.02956281427759677, "learning_rate": 1.545642286220684e-05, "loss": 0.0012, "num_tokens": 34805008.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7697841726618705, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.06869720760732889, "learning_rate": 1.5453724299400786e-05, "loss": 0.0027, "num_tokens": 34812972.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7699686404722376, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.03602675092406571, "learning_rate": 1.5451025171188865e-05, "loss": 0.0014, "num_tokens": 34823508.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7701531082826046, "frac_reward_zero_std": 1.0, "grad_norm": 0.1806640625, "kl": 0.06415445217862725, "learning_rate": 1.5448325477850907e-05, "loss": 0.0026, "num_tokens": 34833015.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.7703375760929718, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.05697881802916527, "learning_rate": 1.5445625219666792e-05, "loss": 0.0023, "num_tokens": 34842373.0, "reward": 1.5790441036224365, "reward_std": 0.3211272060871124, "rewards/fixed_code_pass_all_test_reward/mean": 0.5790441036224365, "rewards/fixed_code_pass_all_test_reward/std": 0.3211272060871124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7705220439033389, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.047597558004781604, "learning_rate": 1.544292439691647e-05, "loss": 0.0019, "num_tokens": 34851748.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.770706511713706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.029382308362983167, "learning_rate": 1.5440223009879946e-05, "loss": 0.0012, "num_tokens": 34855755.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 313.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.770890979524073, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.039363088784739375, "learning_rate": 1.5437521058837273e-05, "loss": 0.0016, "num_tokens": 34864270.0, "reward": 1.3125, "reward_std": 0.7039429545402527, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 442.875, "completions/mean_terminated_length": 442.875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.7710754473344401, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.0333313561277464, "learning_rate": 1.543481854406858e-05, "loss": 0.0013, "num_tokens": 34873229.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7712599151448072, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.05146278068423271, "learning_rate": 1.5432115465854042e-05, "loss": 0.0021, "num_tokens": 34880069.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7714443829551744, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03505857149139047, "learning_rate": 1.5429411824473897e-05, "loss": 0.0014, "num_tokens": 34886469.0, "reward": 1.9642857313156128, "reward_std": 0.06612997502088547, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7716288507655414, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.03198063187301159, "learning_rate": 1.5426707620208435e-05, "loss": 0.0013, "num_tokens": 34897758.0, "reward": 1.530172348022461, "reward_std": 0.7348925471305847, "rewards/fixed_code_pass_all_test_reward/mean": 0.6551724076271057, "rewards/fixed_code_pass_all_test_reward/std": 0.4773625433444977, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7718133185759085, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.037183623760938644, "learning_rate": 1.5424002853338022e-05, "loss": 0.0015, "num_tokens": 34903552.0, "reward": 1.5208333730697632, "reward_std": 0.4027435779571533, "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4027435779571533, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 456.375, "completions/mean_terminated_length": 456.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.7719977863862756, "frac_reward_zero_std": 0.0, "grad_norm": 0.84375, "kl": 0.03460608725436032, "learning_rate": 1.5421297524143062e-05, "loss": 0.0014, "num_tokens": 34916003.0, "reward": 1.48046875, "reward_std": 0.28702935576438904, "rewards/fixed_code_pass_all_test_reward/mean": 0.48046875, "rewards/fixed_code_pass_all_test_reward/std": 0.28702935576438904, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7721822541966427, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.06287805223837495, "learning_rate": 1.5418591632904025e-05, "loss": 0.0025, "num_tokens": 34925267.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7723667220070097, "frac_reward_zero_std": 1.0, "grad_norm": 0.291015625, "kl": 0.06577357917558402, "learning_rate": 1.541588517990144e-05, "loss": 0.0026, "num_tokens": 34929491.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7725511898173769, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.06512370379641652, "learning_rate": 1.5413178165415902e-05, "loss": 0.0026, "num_tokens": 34938546.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 313.375, "completions/mean_terminated_length": 313.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.772735657627744, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.03373398771509528, "learning_rate": 1.5410470589728043e-05, "loss": 0.0013, "num_tokens": 34949333.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7729201254381111, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.05231419624760747, "learning_rate": 1.540776245311858e-05, "loss": 0.0021, "num_tokens": 34956625.0, "reward": 1.9090908765792847, "reward_std": 0.16833095252513885, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7731045932484781, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.049576778430491686, "learning_rate": 1.5405053755868262e-05, "loss": 0.002, "num_tokens": 34962601.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7732890610588452, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.07637656712904572, "learning_rate": 1.540234449825792e-05, "loss": 0.0031, "num_tokens": 34970607.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 508.25, "completions/mean_terminated_length": 508.25, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.7734735288692123, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.03211791848298162, "learning_rate": 1.5399634680568426e-05, "loss": 0.0013, "num_tokens": 34983273.0, "reward": 1.375, "reward_std": 0.6025738716125488, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7736579966795795, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.08807623595930636, "learning_rate": 1.5396924303080715e-05, "loss": 0.0035, "num_tokens": 34990540.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.7738424644899465, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.038086773827672005, "learning_rate": 1.539421336607578e-05, "loss": 0.0015, "num_tokens": 34996040.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7740269323003136, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.1252927128225565, "learning_rate": 1.5391501869834677e-05, "loss": 0.005, "num_tokens": 34999830.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 336.25, "completions/mean_terminated_length": 336.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7742114001106807, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.04273383517283946, "learning_rate": 1.5388789814638515e-05, "loss": 0.0017, "num_tokens": 35009504.0, "reward": 1.8648648262023926, "reward_std": 0.1444655805826187, "rewards/fixed_code_pass_all_test_reward/mean": 0.8648648262023926, "rewards/fixed_code_pass_all_test_reward/std": 0.14446555078029633, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 488.0, "completions/mean_terminated_length": 488.0, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.7743958679210478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.025885277660563588, "learning_rate": 1.538607720076846e-05, "loss": 0.001, "num_tokens": 35021480.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7745803357314148, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.03954869415611029, "learning_rate": 1.538336402850574e-05, "loss": 0.0016, "num_tokens": 35027336.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.774764803541782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.05817483353894204, "learning_rate": 1.538065029813164e-05, "loss": 0.0023, "num_tokens": 35031710.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7749492713521491, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.048816447611898184, "learning_rate": 1.5377936009927493e-05, "loss": 0.002, "num_tokens": 35041147.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 413.125, "completions/mean_terminated_length": 413.125, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7751337391625162, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.03886149381287396, "learning_rate": 1.537522116417471e-05, "loss": 0.0016, "num_tokens": 35049812.0, "reward": 1.014285683631897, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.014285714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.7753182069728832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.038354938733391464, "learning_rate": 1.537250576115474e-05, "loss": 0.0015, "num_tokens": 35056448.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.7755026747832503, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.08601234992966056, "learning_rate": 1.53697898011491e-05, "loss": 0.0034, "num_tokens": 35059926.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7756871425936174, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.060583160258829594, "learning_rate": 1.5367073284439366e-05, "loss": 0.0024, "num_tokens": 35066660.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 615.5, "completions/mean_terminated_length": 615.5, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.7758716104039846, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.02036650711670518, "learning_rate": 1.5364356211307162e-05, "loss": 0.0008, "num_tokens": 35081448.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 457.875, "completions/mean_terminated_length": 457.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7760560782143516, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05927315982989967, "learning_rate": 1.536163858203418e-05, "loss": 0.0024, "num_tokens": 35096855.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 375.75, "completions/mean_terminated_length": 375.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7762405460247187, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.044437556294724345, "learning_rate": 1.5358920396902167e-05, "loss": 0.0018, "num_tokens": 35107229.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7764250138350858, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.04613933269865811, "learning_rate": 1.5356201656192924e-05, "loss": 0.0018, "num_tokens": 35115585.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7766094816454528, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.10815837560221553, "learning_rate": 1.535348236018831e-05, "loss": 0.0043, "num_tokens": 35122811.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7767939494558199, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.0454112661536783, "learning_rate": 1.5350762509170253e-05, "loss": 0.0018, "num_tokens": 35128936.0, "reward": 1.8958332538604736, "reward_std": 0.294627845287323, "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7769784172661871, "frac_reward_zero_std": 1.0, "grad_norm": 0.04150390625, "kl": 0.031617205357179046, "learning_rate": 1.5348042103420718e-05, "loss": 0.0013, "num_tokens": 35133927.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7771628850765542, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.03597757639363408, "learning_rate": 1.534532114322174e-05, "loss": 0.0014, "num_tokens": 35138870.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7773473528869213, "frac_reward_zero_std": 1.0, "grad_norm": 0.060302734375, "kl": 0.04311914648860693, "learning_rate": 1.5342599628855415e-05, "loss": 0.0017, "num_tokens": 35145737.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 423.125, "completions/mean_terminated_length": 423.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7775318206972883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.08446195302531123, "learning_rate": 1.5339877560603893e-05, "loss": 0.0034, "num_tokens": 35155098.0, "reward": 1.125, "reward_std": 0.6408699750900269, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 400.5, "completions/mean_terminated_length": 400.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7777162885076554, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.02498171158367768, "learning_rate": 1.5337154938749374e-05, "loss": 0.001, "num_tokens": 35163438.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7779007563180225, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.05193942366167903, "learning_rate": 1.5334431763574122e-05, "loss": 0.0021, "num_tokens": 35167923.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7780852241283895, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.03880469175055623, "learning_rate": 1.533170803536046e-05, "loss": 0.0016, "num_tokens": 35172893.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7782696919387567, "frac_reward_zero_std": 1.0, "grad_norm": 0.345703125, "kl": 0.07256455300375819, "learning_rate": 1.5328983754390764e-05, "loss": 0.0029, "num_tokens": 35180254.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.7784541597491238, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.04087743326090276, "learning_rate": 1.5326258920947476e-05, "loss": 0.0016, "num_tokens": 35188152.0, "reward": 1.8875000476837158, "reward_std": 0.1552647203207016, "rewards/fixed_code_pass_all_test_reward/mean": 0.8875000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.15526476502418518, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7786386275594909, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.043054357171058655, "learning_rate": 1.5323533535313076e-05, "loss": 0.0017, "num_tokens": 35201186.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.7788230953698579, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.050755386939272285, "learning_rate": 1.5320807597770124e-05, "loss": 0.002, "num_tokens": 35211007.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 676.375, "completions/mean_terminated_length": 676.375, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.779007563180225, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.0350166589487344, "learning_rate": 1.5318081108601228e-05, "loss": 0.0014, "num_tokens": 35225594.0, "reward": 1.46875, "reward_std": 0.45193037390708923, "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, "rewards/fixed_code_pass_all_test_reward/std": 0.4519304037094116, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7791920309905921, "frac_reward_zero_std": 1.0, "grad_norm": 0.65625, "kl": 0.08683659369125962, "learning_rate": 1.5315354068089045e-05, "loss": 0.0035, "num_tokens": 35232353.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7793764988009593, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.04753512376919389, "learning_rate": 1.53126264765163e-05, "loss": 0.0019, "num_tokens": 35242006.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 483.75, "completions/mean_terminated_length": 483.75, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7795609666113263, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.046729941852390766, "learning_rate": 1.5309898334165772e-05, "loss": 0.0019, "num_tokens": 35251084.0, "reward": 0.9147726893424988, "reward_std": 0.36962398886680603, "rewards/fixed_code_pass_all_test_reward/mean": 0.039772726595401764, "rewards/fixed_code_pass_all_test_reward/std": 0.016070609912276268, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7797454344216934, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.05022623646073043, "learning_rate": 1.5307169641320296e-05, "loss": 0.002, "num_tokens": 35258148.0, "reward": 1.8928571939468384, "reward_std": 0.14787116646766663, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.7799299022320605, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.06938020884990692, "learning_rate": 1.5304440398262764e-05, "loss": 0.0028, "num_tokens": 35262056.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.7801143700424276, "frac_reward_zero_std": 1.0, "grad_norm": 0.048583984375, "kl": 0.029115087701939046, "learning_rate": 1.5301710605276127e-05, "loss": 0.0012, "num_tokens": 35266749.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7802988378527946, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.035598845686763525, "learning_rate": 1.529898026264339e-05, "loss": 0.0014, "num_tokens": 35274119.0, "reward": 1.959302306175232, "reward_std": 0.03881504014134407, "rewards/fixed_code_pass_all_test_reward/mean": 0.9593023061752319, "rewards/fixed_code_pass_all_test_reward/std": 0.03881501778960228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 488.625, "completions/mean_terminated_length": 488.625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7804833056631618, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.0507223941385746, "learning_rate": 1.5296249370647623e-05, "loss": 0.002, "num_tokens": 35282956.0, "reward": 1.9666666984558105, "reward_std": 0.09428088366985321, "rewards/fixed_code_pass_all_test_reward/mean": 0.9666666984558105, "rewards/fixed_code_pass_all_test_reward/std": 0.0942808985710144, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 494.125, "completions/mean_terminated_length": 494.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7806677734735289, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.024287003790959716, "learning_rate": 1.529351792957194e-05, "loss": 0.001, "num_tokens": 35295517.0, "reward": 1.9719101190567017, "reward_std": 0.07945017516613007, "rewards/fixed_code_pass_all_test_reward/mean": 0.9719101190567017, "rewards/fixed_code_pass_all_test_reward/std": 0.07945020496845245, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.780852241283896, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.06769803026691079, "learning_rate": 1.529078593969952e-05, "loss": 0.0027, "num_tokens": 35306184.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.781036709094263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.05453792284242809, "learning_rate": 1.5288053401313597e-05, "loss": 0.0022, "num_tokens": 35310453.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.7812211769046301, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.08278459077700973, "learning_rate": 1.5285320314697465e-05, "loss": 0.0033, "num_tokens": 35320485.0, "reward": 1.375, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7814056447149972, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.039522164734080434, "learning_rate": 1.528258668013447e-05, "loss": 0.0016, "num_tokens": 35334358.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7815901125253644, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.06758507061749697, "learning_rate": 1.5279852497908018e-05, "loss": 0.0027, "num_tokens": 35347005.0, "reward": 1.4166667461395264, "reward_std": 0.21257823705673218, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 468.625, "completions/mean_terminated_length": 468.625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.7817745803357314, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.03683425486087799, "learning_rate": 1.5277117768301566e-05, "loss": 0.0015, "num_tokens": 35359466.0, "reward": 1.84375, "reward_std": 0.2893187701702118, "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.7819590481460985, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.16712904628366232, "learning_rate": 1.5274382491598646e-05, "loss": 0.0067, "num_tokens": 35363374.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.7821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.07661651633679867, "learning_rate": 1.527164666808282e-05, "loss": 0.0031, "num_tokens": 35372700.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 235.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.7823279837668327, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.041999108041636646, "learning_rate": 1.5268910298037724e-05, "loss": 0.0017, "num_tokens": 35377437.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7825124515771997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.04805940203368664, "learning_rate": 1.526617338174705e-05, "loss": 0.0019, "num_tokens": 35386625.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7826969193875669, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.0276016442803666, "learning_rate": 1.5263435919494538e-05, "loss": 0.0011, "num_tokens": 35393444.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.782881387197934, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.056785646826028824, "learning_rate": 1.5260697911563993e-05, "loss": 0.0023, "num_tokens": 35401684.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7830658550083011, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.058073720196262, "learning_rate": 1.525795935823927e-05, "loss": 0.0023, "num_tokens": 35407796.0, "reward": 1.90625, "reward_std": 0.1293872892856598, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.7832503228186681, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.04560146410949528, "learning_rate": 1.525522025980429e-05, "loss": 0.0018, "num_tokens": 35420091.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 362.375, "completions/mean_terminated_length": 362.375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7834347906290352, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.08364721166435629, "learning_rate": 1.5252480616543021e-05, "loss": 0.0033, "num_tokens": 35431286.0, "reward": 1.0125000476837158, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 447.125, "completions/mean_terminated_length": 447.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.7836192584394023, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.041779750026762486, "learning_rate": 1.5249740428739487e-05, "loss": 0.0017, "num_tokens": 35443807.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7838037262497695, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04858891956973821, "learning_rate": 1.5246999696677783e-05, "loss": 0.0019, "num_tokens": 35448213.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.04711187840439379, "learning_rate": 1.5244258420642041e-05, "loss": 0.0019, "num_tokens": 35453621.0, "reward": 1.7999999523162842, "reward_std": 0.12344267219305038, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.12344267219305038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 488.125, "completions/mean_terminated_length": 488.125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7841726618705036, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.040148946922272444, "learning_rate": 1.5241516600916462e-05, "loss": 0.0016, "num_tokens": 35466790.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7843571296808707, "frac_reward_zero_std": 1.0, "grad_norm": 0.89453125, "kl": 0.06099623767659068, "learning_rate": 1.5238774237785297e-05, "loss": 0.0024, "num_tokens": 35472362.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7845415974912378, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.17344183754175901, "learning_rate": 1.5236031331532857e-05, "loss": 0.0069, "num_tokens": 35480673.0, "reward": 1.6749999523162842, "reward_std": 0.20059439539909363, "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.20059436559677124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 441.625, "completions/mean_terminated_length": 441.625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.7847260653016048, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.043307977030053735, "learning_rate": 1.523328788244351e-05, "loss": 0.0017, "num_tokens": 35489318.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.784910533111972, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.026094762375578284, "learning_rate": 1.5230543890801676e-05, "loss": 0.001, "num_tokens": 35493894.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.7850950009223391, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.054978877771645784, "learning_rate": 1.5227799356891838e-05, "loss": 0.0022, "num_tokens": 35501882.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 397.875, "completions/mean_terminated_length": 397.875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.7852794687327062, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.04716405947692692, "learning_rate": 1.5225054280998523e-05, "loss": 0.0019, "num_tokens": 35512113.0, "reward": 1.9090909957885742, "reward_std": 0.08416546136140823, "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.08416546136140823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7854639365430732, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.06488224258646369, "learning_rate": 1.5222308663406333e-05, "loss": 0.0026, "num_tokens": 35520475.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.7856484043534403, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.05268768919631839, "learning_rate": 1.5219562504399907e-05, "loss": 0.0021, "num_tokens": 35529621.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7858328721638074, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.05193520220927894, "learning_rate": 1.521681580426395e-05, "loss": 0.0021, "num_tokens": 35536204.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 588.125, "completions/mean_terminated_length": 588.125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.7860173399741746, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.024990473873913288, "learning_rate": 1.5214068563283223e-05, "loss": 0.001, "num_tokens": 35547085.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7862018077845416, "frac_reward_zero_std": 1.0, "grad_norm": 3.15625, "kl": 0.2598787338938564, "learning_rate": 1.5211320781742544e-05, "loss": 0.0104, "num_tokens": 35554943.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7863862755949087, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.09348477283492684, "learning_rate": 1.5208572459926783e-05, "loss": 0.0037, "num_tokens": 35560378.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7865707434052758, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.040949560003355145, "learning_rate": 1.5205823598120863e-05, "loss": 0.0016, "num_tokens": 35566846.0, "reward": 1.7434210777282715, "reward_std": 0.32740238308906555, "rewards/fixed_code_pass_all_test_reward/mean": 0.7434210777282715, "rewards/fixed_code_pass_all_test_reward/std": 0.32740238308906555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 394.75, "completions/mean_terminated_length": 394.75, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.7867552112156428, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.03829534165561199, "learning_rate": 1.5203074196609775e-05, "loss": 0.0015, "num_tokens": 35577364.0, "reward": 1.9894737005233765, "reward_std": 0.01949092000722885, "rewards/fixed_code_pass_all_test_reward/mean": 0.9894737005233765, "rewards/fixed_code_pass_all_test_reward/std": 0.01949094794690609, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 407.25, "completions/mean_terminated_length": 407.25, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.7869396790260099, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04996078601107001, "learning_rate": 1.5200324255678553e-05, "loss": 0.002, "num_tokens": 35585598.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 696.5, "completions/mean_terminated_length": 696.5, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.7871241468363771, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.02940192143432796, "learning_rate": 1.5197573775612297e-05, "loss": 0.0012, "num_tokens": 35603002.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.7873086146467442, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.05472077685408294, "learning_rate": 1.5194822756696154e-05, "loss": 0.0022, "num_tokens": 35607331.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 452.0, "completions/mean_terminated_length": 452.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.7874930824571112, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.04624458588659763, "learning_rate": 1.5192071199215334e-05, "loss": 0.0018, "num_tokens": 35619691.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7876775502674783, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.03861832758411765, "learning_rate": 1.5189319103455104e-05, "loss": 0.0015, "num_tokens": 35628794.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.7878620180778454, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.044964129803702235, "learning_rate": 1.5186566469700776e-05, "loss": 0.0018, "num_tokens": 35633016.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7880464858882125, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.07516616676002741, "learning_rate": 1.5183813298237727e-05, "loss": 0.003, "num_tokens": 35642894.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7882309536985797, "frac_reward_zero_std": 1.0, "grad_norm": 0.048828125, "kl": 0.031358262058347464, "learning_rate": 1.5181059589351388e-05, "loss": 0.0013, "num_tokens": 35651743.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 99.0, "completions/mean_terminated_length": 99.0, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.7884154215089467, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.06317651877179742, "learning_rate": 1.5178305343327247e-05, "loss": 0.0025, "num_tokens": 35655247.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7885998893193138, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.028904934064485133, "learning_rate": 1.5175550560450843e-05, "loss": 0.0012, "num_tokens": 35662142.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7887843571296809, "frac_reward_zero_std": 1.0, "grad_norm": 0.038818359375, "kl": 0.032977010821923614, "learning_rate": 1.5172795241007775e-05, "loss": 0.0013, "num_tokens": 35669281.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7889688249400479, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.04830338363535702, "learning_rate": 1.5170039385283697e-05, "loss": 0.0019, "num_tokens": 35681330.0, "reward": 1.6840277910232544, "reward_std": 0.3826562464237213, "rewards/fixed_code_pass_all_test_reward/mean": 0.6840277910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.3826562762260437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.789153292750415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0576171875, "kl": 0.025045650778338313, "learning_rate": 1.5167282993564316e-05, "loss": 0.001, "num_tokens": 35686925.0, "reward": 1.6710526943206787, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6710526347160339, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.7893377605607822, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.062272934243083, "learning_rate": 1.5164526066135397e-05, "loss": 0.0025, "num_tokens": 35693818.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 524.125, "completions/mean_terminated_length": 524.125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.7895222283711493, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.03580970806069672, "learning_rate": 1.516176860328276e-05, "loss": 0.0014, "num_tokens": 35703635.0, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 107.625, "completions/mean_terminated_length": 107.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.7897066961815163, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.08616695599630475, "learning_rate": 1.5159010605292284e-05, "loss": 0.0034, "num_tokens": 35707304.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7898911639918834, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.06656726030632854, "learning_rate": 1.5156252072449894e-05, "loss": 0.0027, "num_tokens": 35717195.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7900756318022505, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.0351130492053926, "learning_rate": 1.5153493005041578e-05, "loss": 0.0014, "num_tokens": 35722932.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7902600996126176, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.04435131628997624, "learning_rate": 1.5150733403353377e-05, "loss": 0.0018, "num_tokens": 35729509.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 440.5, "completions/mean_terminated_length": 440.5, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7904445674229846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.040356529178097844, "learning_rate": 1.5147973267671394e-05, "loss": 0.0016, "num_tokens": 35739473.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 357.625, "completions/mean_terminated_length": 357.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7906290352333518, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.047102137468755245, "learning_rate": 1.5145212598281776e-05, "loss": 0.0019, "num_tokens": 35750862.0, "reward": 1.734375, "reward_std": 0.16952534019947052, "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, "rewards/fixed_code_pass_all_test_reward/std": 0.16952534019947052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7908135030437189, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.03653145511634648, "learning_rate": 1.514245139547073e-05, "loss": 0.0015, "num_tokens": 35756747.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 341.25, "completions/mean_terminated_length": 341.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.790997970854086, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.015574604272842407, "learning_rate": 1.5139689659524522e-05, "loss": 0.0006, "num_tokens": 35763821.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 513.875, "completions/mean_terminated_length": 513.875, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.791182438664453, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.03578599519096315, "learning_rate": 1.513692739072947e-05, "loss": 0.0014, "num_tokens": 35773532.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7913669064748201, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.06496074004098773, "learning_rate": 1.5134164589371947e-05, "loss": 0.0026, "num_tokens": 35783285.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 531.5, "completions/mean_terminated_length": 531.5, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.7915513742851872, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.04257419263012707, "learning_rate": 1.5131401255738381e-05, "loss": 0.0017, "num_tokens": 35797921.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7917358420955544, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.0576136929448694, "learning_rate": 1.5128637390115256e-05, "loss": 0.0023, "num_tokens": 35804552.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7919203099059214, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.037702382542192936, "learning_rate": 1.5125872992789119e-05, "loss": 0.0015, "num_tokens": 35808603.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7921047777162885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0306396484375, "kl": 0.017003939487040043, "learning_rate": 1.5123108064046554e-05, "loss": 0.0007, "num_tokens": 35814520.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 611.75, "completions/mean_terminated_length": 611.75, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.7922892455266556, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.036021529231220484, "learning_rate": 1.5120342604174213e-05, "loss": 0.0014, "num_tokens": 35824774.0, "reward": 1.326923131942749, "reward_std": 0.5762816667556763, "rewards/fixed_code_pass_all_test_reward/mean": 0.45192307233810425, "rewards/fixed_code_pass_all_test_reward/std": 0.27924850583076477, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7924737133370227, "frac_reward_zero_std": 0.0, "grad_norm": 0.94921875, "kl": 0.037810812471434474, "learning_rate": 1.5117576613458803e-05, "loss": 0.0015, "num_tokens": 35833844.0, "reward": 1.5, "reward_std": 0.9258201122283936, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 634.25, "completions/mean_terminated_length": 634.25, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.7926581811473897, "frac_reward_zero_std": 0.0, "grad_norm": 0.82421875, "kl": 0.023020552471280098, "learning_rate": 1.5114810092187082e-05, "loss": 0.0009, "num_tokens": 35852398.0, "reward": 1.8020833730697632, "reward_std": 0.3240906596183777, "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7928426489577569, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.02639222051948309, "learning_rate": 1.5112043040645863e-05, "loss": 0.0011, "num_tokens": 35858077.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.793027116768124, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.07339434139430523, "learning_rate": 1.510927545912202e-05, "loss": 0.0029, "num_tokens": 35861942.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.7932115845784911, "frac_reward_zero_std": 0.0, "grad_norm": 3.28125, "kl": 0.2752287737093866, "learning_rate": 1.5106507347902475e-05, "loss": 0.011, "num_tokens": 35866206.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7933960523888581, "frac_reward_zero_std": 1.0, "grad_norm": 0.035400390625, "kl": 0.016725976718589664, "learning_rate": 1.5103738707274205e-05, "loss": 0.0007, "num_tokens": 35871500.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7935805201992252, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.018712026649154723, "learning_rate": 1.5100969537524245e-05, "loss": 0.0007, "num_tokens": 35876728.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7937649880095923, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.057299571577459574, "learning_rate": 1.5098199838939684e-05, "loss": 0.0023, "num_tokens": 35885653.0, "reward": 1.5749999284744263, "reward_std": 0.6363961100578308, "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 627.5, "completions/mean_terminated_length": 627.5, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.7939494558199595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0419921875, "kl": 0.027587792137637734, "learning_rate": 1.5095429611807673e-05, "loss": 0.0011, "num_tokens": 35899769.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7941339236303265, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.02641181240323931, "learning_rate": 1.5092658856415403e-05, "loss": 0.0011, "num_tokens": 35910325.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7943183914406936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0400390625, "kl": 0.049518843181431293, "learning_rate": 1.5089887573050129e-05, "loss": 0.002, "num_tokens": 35919127.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 596.875, "completions/mean_terminated_length": 596.875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.7945028592510607, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.03633172903209925, "learning_rate": 1.5087115761999158e-05, "loss": 0.0015, "num_tokens": 35930350.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7946873270614278, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.047069798689335585, "learning_rate": 1.5084343423549857e-05, "loss": 0.0019, "num_tokens": 35938413.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7948717948717948, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.03980321902781725, "learning_rate": 1.508157055798964e-05, "loss": 0.0016, "num_tokens": 35948479.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.795056262682162, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.07086195051670074, "learning_rate": 1.507879716560598e-05, "loss": 0.0028, "num_tokens": 35956128.0, "reward": 1.75, "reward_std": 0.26726123690605164, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7952407304925291, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.040794012136757374, "learning_rate": 1.5076023246686407e-05, "loss": 0.0016, "num_tokens": 35966354.0, "reward": 1.7922296524047852, "reward_std": 0.3080655038356781, "rewards/fixed_code_pass_all_test_reward/mean": 0.7922297716140747, "rewards/fixed_code_pass_all_test_reward/std": 0.3080655634403229, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7954251983028962, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.052502304781228304, "learning_rate": 1.5073248801518499e-05, "loss": 0.0021, "num_tokens": 35974141.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 321.25, "completions/mean_terminated_length": 321.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7956096661132632, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.04905204311944544, "learning_rate": 1.5070473830389892e-05, "loss": 0.002, "num_tokens": 35983167.0, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 418.625, "completions/mean_terminated_length": 418.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7957941339236303, "frac_reward_zero_std": 1.0, "grad_norm": 0.177734375, "kl": 0.07289852318353951, "learning_rate": 1.5067698333588276e-05, "loss": 0.0029, "num_tokens": 35991004.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7959786017339974, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.05602266942150891, "learning_rate": 1.5064922311401403e-05, "loss": 0.0022, "num_tokens": 36001719.0, "reward": 1.7777776718139648, "reward_std": 0.20573778450489044, "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7961630695443646, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.02790191792882979, "learning_rate": 1.5062145764117064e-05, "loss": 0.0011, "num_tokens": 36010948.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7963475373547316, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.030230681411921978, "learning_rate": 1.5059368692023114e-05, "loss": 0.0012, "num_tokens": 36017717.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7965320051650987, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.033905977848917246, "learning_rate": 1.5056591095407465e-05, "loss": 0.0014, "num_tokens": 36024651.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7967164729754658, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.06293100956827402, "learning_rate": 1.5053812974558078e-05, "loss": 0.0025, "num_tokens": 36034345.0, "reward": 1.0833333730697632, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7969009407858328, "frac_reward_zero_std": 1.0, "grad_norm": 0.51171875, "kl": 0.08539120806381106, "learning_rate": 1.5051034329762972e-05, "loss": 0.0034, "num_tokens": 36043839.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7970854085961999, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03186343004927039, "learning_rate": 1.5048255161310215e-05, "loss": 0.0013, "num_tokens": 36056591.0, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7972698764065671, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.037303369492292404, "learning_rate": 1.5045475469487932e-05, "loss": 0.0015, "num_tokens": 36060827.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 213.125, "completions/mean_terminated_length": 213.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7974543442169342, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.03575144917704165, "learning_rate": 1.5042695254584308e-05, "loss": 0.0014, "num_tokens": 36067300.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7976388120273012, "frac_reward_zero_std": 1.0, "grad_norm": 0.056884765625, "kl": 0.043438287219032645, "learning_rate": 1.5039914516887575e-05, "loss": 0.0017, "num_tokens": 36075195.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 439.25, "completions/mean_terminated_length": 439.25, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.7978232798376683, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.060549080139026046, "learning_rate": 1.5037133256686019e-05, "loss": 0.0024, "num_tokens": 36086629.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 682.25, "completions/mean_terminated_length": 682.25, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.7980077476480354, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.021951534552499652, "learning_rate": 1.5034351474267985e-05, "loss": 0.0009, "num_tokens": 36102327.0, "reward": 1.990384578704834, "reward_std": 0.027196446433663368, "rewards/fixed_code_pass_all_test_reward/mean": 0.990384578704834, "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7981922154584025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.03357413015328348, "learning_rate": 1.5031569169921869e-05, "loss": 0.0013, "num_tokens": 36109365.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7983766832687696, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.07525891670957208, "learning_rate": 1.5028786343936123e-05, "loss": 0.003, "num_tokens": 36119326.0, "reward": 1.5892857313156128, "reward_std": 0.0991949662566185, "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.09919500350952148, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 95.25, "completions/mean_terminated_length": 95.25, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7985611510791367, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.030775656923651695, "learning_rate": 1.5026002996599252e-05, "loss": 0.0012, "num_tokens": 36122800.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7987456188895038, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.057522712741047144, "learning_rate": 1.5023219128199813e-05, "loss": 0.0023, "num_tokens": 36132474.0, "reward": 1.7083333730697632, "reward_std": 0.4520675241947174, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.4520675837993622, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7989300866998709, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.05647197691723704, "learning_rate": 1.502043473902642e-05, "loss": 0.0023, "num_tokens": 36139758.0, "reward": 1.8643617630004883, "reward_std": 0.11254923790693283, "rewards/fixed_code_pass_all_test_reward/mean": 0.8643617033958435, "rewards/fixed_code_pass_all_test_reward/std": 0.11254924535751343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.7991145545102379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0517578125, "kl": 0.048513495828956366, "learning_rate": 1.5017649829367737e-05, "loss": 0.0019, "num_tokens": 36152230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 470.25, "completions/mean_terminated_length": 470.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.799299022320605, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.032952550100162625, "learning_rate": 1.5014864399512487e-05, "loss": 0.0013, "num_tokens": 36160840.0, "reward": 1.8214285373687744, "reward_std": 0.3393528461456299, "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.07393559068441391, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7994834901309722, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.039512885734438896, "learning_rate": 1.501207844974945e-05, "loss": 0.0016, "num_tokens": 36169112.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 435.125, "completions/mean_terminated_length": 435.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7996679579413393, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.033033008221536875, "learning_rate": 1.5009291980367449e-05, "loss": 0.0013, "num_tokens": 36180657.0, "reward": 1.9514925479888916, "reward_std": 0.0764177218079567, "rewards/fixed_code_pass_all_test_reward/mean": 0.9514925479888916, "rewards/fixed_code_pass_all_test_reward/std": 0.07641774415969849, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.7998524257517063, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.03638588031753898, "learning_rate": 1.5006504991655367e-05, "loss": 0.0015, "num_tokens": 36184875.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 450.625, "completions/mean_terminated_length": 450.625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.8000368935620734, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.03963064565323293, "learning_rate": 1.500371748390214e-05, "loss": 0.0016, "num_tokens": 36196160.0, "reward": 1.0475351810455322, "reward_std": 0.5257523655891418, "rewards/fixed_code_pass_all_test_reward/mean": 0.17253521084785461, "rewards/fixed_code_pass_all_test_reward/std": 0.31956180930137634, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 371.625, "completions/mean_terminated_length": 371.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8002213613724405, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.03471875935792923, "learning_rate": 1.500092945739676e-05, "loss": 0.0014, "num_tokens": 36202701.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8004058291828076, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.1129117519594729, "learning_rate": 1.4998140912428274e-05, "loss": 0.0045, "num_tokens": 36206450.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8005902969931747, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.03067458188161254, "learning_rate": 1.4995351849285773e-05, "loss": 0.0012, "num_tokens": 36210801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 414.875, "completions/mean_terminated_length": 414.875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8007747648035418, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.039575346279889345, "learning_rate": 1.4992562268258413e-05, "loss": 0.0016, "num_tokens": 36222480.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 433.0, "completions/mean_terminated_length": 433.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8009592326139089, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.044190880842506886, "learning_rate": 1.4989772169635397e-05, "loss": 0.0018, "num_tokens": 36231944.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.801143700424276, "frac_reward_zero_std": 1.0, "grad_norm": 0.044677734375, "kl": 0.017380940495058894, "learning_rate": 1.4986981553705985e-05, "loss": 0.0007, "num_tokens": 36236511.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 459.0, "completions/mean_terminated_length": 459.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.801328168234643, "frac_reward_zero_std": 0.0, "grad_norm": 0.66796875, "kl": 0.014636554347816855, "learning_rate": 1.4984190420759492e-05, "loss": 0.0006, "num_tokens": 36245311.0, "reward": 1.2125000953674316, "reward_std": 0.035355307161808014, "rewards/fixed_code_pass_all_test_reward/mean": 0.21250000596046448, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 250.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8015126360450101, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.041719620348885655, "learning_rate": 1.4981398771085278e-05, "loss": 0.0017, "num_tokens": 36251129.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8016971038553773, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.02718952053692192, "learning_rate": 1.4978606604972768e-05, "loss": 0.0011, "num_tokens": 36261226.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8018815716657444, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.05092989862896502, "learning_rate": 1.4975813922711435e-05, "loss": 0.002, "num_tokens": 36271172.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8020660394761114, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.04369087144732475, "learning_rate": 1.4973020724590803e-05, "loss": 0.0017, "num_tokens": 36278156.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8022505072864785, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.055120388278737664, "learning_rate": 1.4970227010900453e-05, "loss": 0.0022, "num_tokens": 36288501.0, "reward": 1.4821428060531616, "reward_std": 0.5460566282272339, "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.3253166079521179, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8024349750968456, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.03757225477602333, "learning_rate": 1.496743278193002e-05, "loss": 0.0015, "num_tokens": 36292262.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8026194429072127, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.023363763699308038, "learning_rate": 1.496463803796919e-05, "loss": 0.0009, "num_tokens": 36298492.0, "reward": 1.94140625, "reward_std": 0.1657281517982483, "rewards/fixed_code_pass_all_test_reward/mean": 0.94140625, "rewards/fixed_code_pass_all_test_reward/std": 0.1657281517982483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 372.375, "completions/mean_terminated_length": 372.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8028039107175797, "frac_reward_zero_std": 1.0, "grad_norm": 0.039306640625, "kl": 0.02221129834651947, "learning_rate": 1.4961842779307702e-05, "loss": 0.0009, "num_tokens": 36308911.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8029883785279469, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.04054278717376292, "learning_rate": 1.4959047006235352e-05, "loss": 0.0016, "num_tokens": 36314516.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 458.25, "completions/mean_terminated_length": 458.25, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.803172846338314, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.030884688487276435, "learning_rate": 1.4956250719041987e-05, "loss": 0.0012, "num_tokens": 36322238.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.8033573141486811, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.039740476524457335, "learning_rate": 1.4953453918017512e-05, "loss": 0.0016, "num_tokens": 36331136.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8035417819590481, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.034326078137382865, "learning_rate": 1.4950656603451868e-05, "loss": 0.0014, "num_tokens": 36337959.0, "reward": 1.3020832538604736, "reward_std": 0.4541202187538147, "rewards/fixed_code_pass_all_test_reward/mean": 0.3020833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.4541202485561371, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8037262497694152, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.12260240619070828, "learning_rate": 1.4947858775635073e-05, "loss": 0.0049, "num_tokens": 36348257.0, "reward": 1.8192567825317383, "reward_std": 0.29588446021080017, "rewards/fixed_code_pass_all_test_reward/mean": 0.8192567825317383, "rewards/fixed_code_pass_all_test_reward/std": 0.29588449001312256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 438.375, "completions/mean_terminated_length": 438.375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8039107175797823, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.04997399100102484, "learning_rate": 1.4945060434857185e-05, "loss": 0.002, "num_tokens": 36364260.0, "reward": 1.78125, "reward_std": 0.41052013635635376, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8040951853901495, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.032463502953760326, "learning_rate": 1.4942261581408315e-05, "loss": 0.0013, "num_tokens": 36370042.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8042796532005165, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.02374658768530935, "learning_rate": 1.493946221557863e-05, "loss": 0.0009, "num_tokens": 36375059.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8044641210108836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.05420694872736931, "learning_rate": 1.4936662337658353e-05, "loss": 0.0022, "num_tokens": 36383654.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 379.5, "completions/mean_terminated_length": 379.5, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8046485888212507, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03056553890928626, "learning_rate": 1.4933861947937754e-05, "loss": 0.0012, "num_tokens": 36393146.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 427.25, "completions/mean_terminated_length": 427.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.8048330566316177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.04093357943929732, "learning_rate": 1.4931061046707159e-05, "loss": 0.0016, "num_tokens": 36403284.0, "reward": 1.528735637664795, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5287356376647949, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8050175244419848, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.13578288117423654, "learning_rate": 1.4928259634256943e-05, "loss": 0.0054, "num_tokens": 36410359.0, "reward": 1.6414835453033447, "reward_std": 0.7222461700439453, "rewards/fixed_code_pass_all_test_reward/mean": 0.7664835453033447, "rewards/fixed_code_pass_all_test_reward/std": 0.4214785695075989, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.805201992252352, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.052997968159615993, "learning_rate": 1.4925457710877545e-05, "loss": 0.0021, "num_tokens": 36414148.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8053864600627191, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.06247329921461642, "learning_rate": 1.4922655276859446e-05, "loss": 0.0025, "num_tokens": 36422069.0, "reward": 1.625, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8055709278730862, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.03539038309827447, "learning_rate": 1.4919852332493183e-05, "loss": 0.0014, "num_tokens": 36426359.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8057553956834532, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.024879404110834002, "learning_rate": 1.4917048878069348e-05, "loss": 0.001, "num_tokens": 36436557.0, "reward": 1.7048193216323853, "reward_std": 0.12421075254678726, "rewards/fixed_code_pass_all_test_reward/mean": 0.7048192620277405, "rewards/fixed_code_pass_all_test_reward/std": 0.12421079725027084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 425.5, "completions/mean_terminated_length": 425.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8059398634938203, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.06623038137331605, "learning_rate": 1.4914244913878584e-05, "loss": 0.0026, "num_tokens": 36444113.0, "reward": 1.4166666269302368, "reward_std": 0.45730987191200256, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, "rewards/fixed_code_pass_all_test_reward/std": 0.45730993151664734, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 788.125, "completions/mean_terminated_length": 788.125, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.8061243313041874, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.026985066826455295, "learning_rate": 1.4911440440211592e-05, "loss": 0.0011, "num_tokens": 36457578.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.8063087991145546, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.049860212951898575, "learning_rate": 1.4908635457359112e-05, "loss": 0.002, "num_tokens": 36463418.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 332.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8064932669249216, "frac_reward_zero_std": 1.0, "grad_norm": 0.033203125, "kl": 0.018359996378421783, "learning_rate": 1.490582996561195e-05, "loss": 0.0007, "num_tokens": 36471894.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8066777347352887, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03707440476864576, "learning_rate": 1.4903023965260963e-05, "loss": 0.0015, "num_tokens": 36477113.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8068622025456558, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.029892700724303722, "learning_rate": 1.4900217456597059e-05, "loss": 0.0012, "num_tokens": 36481524.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8070466703560228, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.05107171507552266, "learning_rate": 1.4897410439911192e-05, "loss": 0.002, "num_tokens": 36491487.0, "reward": 1.8785715103149414, "reward_std": 0.3434518277645111, "rewards/fixed_code_pass_all_test_reward/mean": 0.8785714507102966, "rewards/fixed_code_pass_all_test_reward/std": 0.3434518575668335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 656.0, "completions/mean_terminated_length": 656.0, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.8072311381663899, "frac_reward_zero_std": 0.0, "grad_norm": 0.76953125, "kl": 0.03920824360102415, "learning_rate": 1.4894602915494382e-05, "loss": 0.0016, "num_tokens": 36503759.0, "reward": 1.1416666507720947, "reward_std": 0.1433720737695694, "rewards/fixed_code_pass_all_test_reward/mean": 0.14166668057441711, "rewards/fixed_code_pass_all_test_reward/std": 0.14337210357189178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8074156059767571, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.054348713252693415, "learning_rate": 1.4891794883637692e-05, "loss": 0.0022, "num_tokens": 36512334.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 445.0, "completions/mean_terminated_length": 445.0, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.8076000737871242, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.025964933389332145, "learning_rate": 1.4888986344632239e-05, "loss": 0.001, "num_tokens": 36521030.0, "reward": 1.9642857313156128, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8077845415974912, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.09122531954199076, "learning_rate": 1.4886177298769192e-05, "loss": 0.0036, "num_tokens": 36524919.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 375.875, "completions/mean_terminated_length": 375.875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8079690094078583, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.05263061635196209, "learning_rate": 1.4883367746339778e-05, "loss": 0.0021, "num_tokens": 36532342.0, "reward": 1.9076087474822998, "reward_std": 0.13030529022216797, "rewards/fixed_code_pass_all_test_reward/mean": 0.9076087474822998, "rewards/fixed_code_pass_all_test_reward/std": 0.13030532002449036, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8081534772182254, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.031326914206147194, "learning_rate": 1.4880557687635269e-05, "loss": 0.0013, "num_tokens": 36538970.0, "reward": 1.9038461446762085, "reward_std": 0.2719641625881195, "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8083379450285925, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0400226708734408, "learning_rate": 1.4877747122947e-05, "loss": 0.0016, "num_tokens": 36543813.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8085224128389596, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.047981366980820894, "learning_rate": 1.487493605256634e-05, "loss": 0.0019, "num_tokens": 36548865.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 246.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8087068806493267, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.04247316438704729, "learning_rate": 1.4872124476784734e-05, "loss": 0.0017, "num_tokens": 36556607.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 292.125, "completions/mean_terminated_length": 292.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8088913484596938, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.06693923333659768, "learning_rate": 1.486931239589366e-05, "loss": 0.0027, "num_tokens": 36565520.0, "reward": 1.1111111640930176, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.1111111119389534, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8090758162700609, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.02382931101601571, "learning_rate": 1.4866499810184662e-05, "loss": 0.001, "num_tokens": 36569705.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 350.625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8092602840804279, "frac_reward_zero_std": 1.0, "grad_norm": 0.045166015625, "kl": 0.045349675696343184, "learning_rate": 1.4863686719949321e-05, "loss": 0.0018, "num_tokens": 36576966.0, "reward": 1.5384615659713745, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 367.0, "completions/mean_terminated_length": 367.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.809444751890795, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.04114903451409191, "learning_rate": 1.486087312547929e-05, "loss": 0.0016, "num_tokens": 36586294.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 415.25, "completions/mean_terminated_length": 415.25, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8096292197011622, "frac_reward_zero_std": 0.0, "grad_norm": 0.9140625, "kl": 0.023470085579901934, "learning_rate": 1.4858059027066256e-05, "loss": 0.0009, "num_tokens": 36595032.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 435.125, "completions/mean_terminated_length": 435.125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.8098136875115293, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.0530205462127924, "learning_rate": 1.485524442500197e-05, "loss": 0.0021, "num_tokens": 36607097.0, "reward": 1.5333333015441895, "reward_std": 0.41975051164627075, "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333015441895, "rewards/fixed_code_pass_all_test_reward/std": 0.41975051164627075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8099981553218963, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.07789242453873158, "learning_rate": 1.485242931957823e-05, "loss": 0.0031, "num_tokens": 36618089.0, "reward": 1.7083333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 381.625, "completions/mean_terminated_length": 381.625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.8101826231322634, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.05858763330616057, "learning_rate": 1.4849613711086885e-05, "loss": 0.0023, "num_tokens": 36625870.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8103670909426305, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.016684241592884064, "learning_rate": 1.4846797599819844e-05, "loss": 0.0007, "num_tokens": 36631005.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 628.0, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.8105515587529976, "frac_reward_zero_std": 0.0, "grad_norm": 0.6875, "kl": 0.03264805069193244, "learning_rate": 1.4843980986069058e-05, "loss": 0.0013, "num_tokens": 36645437.0, "reward": 1.894230842590332, "reward_std": 0.29916054010391235, "rewards/fixed_code_pass_all_test_reward/mean": 0.8942307829856873, "rewards/fixed_code_pass_all_test_reward/std": 0.29916056990623474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8107360265633647, "frac_reward_zero_std": 1.0, "grad_norm": 0.033203125, "kl": 0.015926459687761962, "learning_rate": 1.4841163870126533e-05, "loss": 0.0006, "num_tokens": 36652990.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 367.25, "completions/mean_terminated_length": 367.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8109204943737318, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.03767395415343344, "learning_rate": 1.4838346252284337e-05, "loss": 0.0015, "num_tokens": 36660320.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 487.875, "completions/mean_terminated_length": 487.875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.8111049621840989, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.024379739305004478, "learning_rate": 1.4835528132834579e-05, "loss": 0.001, "num_tokens": 36671607.0, "reward": 1.2467105388641357, "reward_std": 0.25120896100997925, "rewards/fixed_code_pass_all_test_reward/mean": 0.24671052396297455, "rewards/fixed_code_pass_all_test_reward/std": 0.25120899081230164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.811289429994466, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.06366124947089702, "learning_rate": 1.4832709512069414e-05, "loss": 0.0025, "num_tokens": 36679961.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 503.25, "completions/mean_terminated_length": 503.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.811473897804833, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.038279904052615166, "learning_rate": 1.482989039028107e-05, "loss": 0.0015, "num_tokens": 36690275.0, "reward": 1.6666667461395264, "reward_std": 0.17817415297031403, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 526.875, "completions/mean_terminated_length": 526.875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.8116583656152001, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.03175914043094963, "learning_rate": 1.4827070767761806e-05, "loss": 0.0013, "num_tokens": 36703306.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 464.625, "completions/mean_terminated_length": 464.625, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.8118428334255673, "frac_reward_zero_std": 1.0, "grad_norm": 0.2177734375, "kl": 0.026326433988288045, "learning_rate": 1.4824250644803951e-05, "loss": 0.0011, "num_tokens": 36712399.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.8120273012359344, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.03551439684815705, "learning_rate": 1.4821430021699865e-05, "loss": 0.0014, "num_tokens": 36724071.0, "reward": 1.8858695030212402, "reward_std": 0.3228096067905426, "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 572.5, "completions/mean_terminated_length": 361.71429443359375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8122117690463014, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03990285762120038, "learning_rate": 1.4818608898741982e-05, "loss": 0.0016, "num_tokens": 36733731.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 585.625, "completions/mean_terminated_length": 585.625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.8123962368566685, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.055546588730067015, "learning_rate": 1.4815787276222768e-05, "loss": 0.0022, "num_tokens": 36749432.0, "reward": 1.6796875, "reward_std": 0.7066627144813538, "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, "rewards/fixed_code_pass_all_test_reward/std": 0.380080908536911, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8125807046670356, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.04892460582777858, "learning_rate": 1.481296515443476e-05, "loss": 0.002, "num_tokens": 36755513.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8127651724774027, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.06480020051822066, "learning_rate": 1.4810142533670526e-05, "loss": 0.0026, "num_tokens": 36760880.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 423.5, "completions/mean_terminated_length": 423.5, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8129496402877698, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.062073503620922565, "learning_rate": 1.4807319414222708e-05, "loss": 0.0025, "num_tokens": 36771420.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 386.875, "completions/mean_terminated_length": 386.875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.8131341080981369, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.11369953537359834, "learning_rate": 1.4804495796383977e-05, "loss": 0.0045, "num_tokens": 36782267.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.813318575908504, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.0514680533669889, "learning_rate": 1.4801671680447079e-05, "loss": 0.0021, "num_tokens": 36786428.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.813503043718871, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.04786990396678448, "learning_rate": 1.4798847066704785e-05, "loss": 0.0019, "num_tokens": 36797000.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8136875115292381, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.04091847641393542, "learning_rate": 1.4796021955449943e-05, "loss": 0.0016, "num_tokens": 36805182.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8138719793396052, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.04209016659297049, "learning_rate": 1.479319634697544e-05, "loss": 0.0017, "num_tokens": 36814164.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8140564471499724, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.03043151763267815, "learning_rate": 1.4790370241574214e-05, "loss": 0.0012, "num_tokens": 36818150.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8142409149603395, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.04518334660679102, "learning_rate": 1.4787543639539257e-05, "loss": 0.0018, "num_tokens": 36826428.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8144253827707065, "frac_reward_zero_std": 1.0, "grad_norm": 0.048095703125, "kl": 0.0336305710952729, "learning_rate": 1.4784716541163615e-05, "loss": 0.0013, "num_tokens": 36834669.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.8146098505810736, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.039790721610188484, "learning_rate": 1.478188894674038e-05, "loss": 0.0016, "num_tokens": 36843576.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8147943183914407, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0753085152246058, "learning_rate": 1.47790608565627e-05, "loss": 0.003, "num_tokens": 36850193.0, "reward": 1.8434065580368042, "reward_std": 0.2161194235086441, "rewards/fixed_code_pass_all_test_reward/mean": 0.8434065580368042, "rewards/fixed_code_pass_all_test_reward/std": 0.2161194235086441, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 465.375, "completions/mean_terminated_length": 465.375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.8149787862018077, "frac_reward_zero_std": 0.0, "grad_norm": 0.81640625, "kl": 0.033116265665739775, "learning_rate": 1.4776232270923771e-05, "loss": 0.0013, "num_tokens": 36861916.0, "reward": 1.8494318723678589, "reward_std": 0.24566414952278137, "rewards/fixed_code_pass_all_test_reward/mean": 0.8494318723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.24566416442394257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8151632540121748, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.04929734254255891, "learning_rate": 1.4773403190116845e-05, "loss": 0.002, "num_tokens": 36874172.0, "reward": 1.454545497894287, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.815347721822542, "frac_reward_zero_std": 1.0, "grad_norm": 0.51171875, "kl": 0.07296629901975393, "learning_rate": 1.4770573614435218e-05, "loss": 0.0029, "num_tokens": 36881533.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 529.75, "completions/mean_terminated_length": 529.75, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.8155321896329091, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04199667274951935, "learning_rate": 1.476774354417224e-05, "loss": 0.0017, "num_tokens": 36891859.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 361.0, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8157166574432761, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.029323241906240582, "learning_rate": 1.4764912979621321e-05, "loss": 0.0012, "num_tokens": 36899139.0, "reward": 1.8382352590560913, "reward_std": 0.174357071518898, "rewards/fixed_code_pass_all_test_reward/mean": 0.8382353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.1743570864200592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8159011252536432, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.02728944446425885, "learning_rate": 1.4762081921075912e-05, "loss": 0.0011, "num_tokens": 36906275.0, "reward": 1.9886363744735718, "reward_std": 0.03214118629693985, "rewards/fixed_code_pass_all_test_reward/mean": 0.9886363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 373.75, "completions/mean_terminated_length": 373.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8160855930640103, "frac_reward_zero_std": 1.0, "grad_norm": 0.061767578125, "kl": 0.05139522533863783, "learning_rate": 1.4759250368829519e-05, "loss": 0.0021, "num_tokens": 36915193.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8162700608743774, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.029186043422669172, "learning_rate": 1.4756418323175691e-05, "loss": 0.0012, "num_tokens": 36920511.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8164545286847446, "frac_reward_zero_std": 1.0, "grad_norm": 0.043701171875, "kl": 0.031119376653805375, "learning_rate": 1.4753585784408049e-05, "loss": 0.0012, "num_tokens": 36930364.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 420.875, "completions/mean_terminated_length": 420.875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8166389964951116, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.040793149964883924, "learning_rate": 1.475075275282024e-05, "loss": 0.0016, "num_tokens": 36938603.0, "reward": 1.25, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8168234643054787, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.04774754913523793, "learning_rate": 1.4747919228705982e-05, "loss": 0.0019, "num_tokens": 36949549.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 484.0, "completions/mean_terminated_length": 484.0, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8170079321158458, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.027475686511024833, "learning_rate": 1.474508521235903e-05, "loss": 0.0011, "num_tokens": 36961421.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8171923999262128, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.060472844168543816, "learning_rate": 1.4742250704073199e-05, "loss": 0.0024, "num_tokens": 36969468.0, "reward": 1.8017241954803467, "reward_std": 0.3834303915500641, "rewards/fixed_code_pass_all_test_reward/mean": 0.8017241358757019, "rewards/fixed_code_pass_all_test_reward/std": 0.3834303915500641, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8173768677365799, "frac_reward_zero_std": 0.0, "grad_norm": 5.84375, "kl": 0.26671713683754206, "learning_rate": 1.4739415704142352e-05, "loss": 0.0107, "num_tokens": 36979213.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8175613355469471, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.06681014341302216, "learning_rate": 1.4736580212860405e-05, "loss": 0.0027, "num_tokens": 36983604.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8177458033573142, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0629424867220223, "learning_rate": 1.4733744230521314e-05, "loss": 0.0025, "num_tokens": 36990185.0, "reward": 1.1304347515106201, "reward_std": 0.1859208345413208, "rewards/fixed_code_pass_all_test_reward/mean": 0.1304347813129425, "rewards/fixed_code_pass_all_test_reward/std": 0.1859208643436432, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8179302711676812, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.04263966716825962, "learning_rate": 1.4730907757419108e-05, "loss": 0.0017, "num_tokens": 36998510.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 404.625, "completions/mean_terminated_length": 404.625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.8181147389780483, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.02421852620318532, "learning_rate": 1.4728070793847842e-05, "loss": 0.001, "num_tokens": 37006195.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 225.875, "completions/mean_terminated_length": 225.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8182992067884154, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0183974995452445, "learning_rate": 1.4725233340101641e-05, "loss": 0.0007, "num_tokens": 37011738.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8184836745987825, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.05790543183684349, "learning_rate": 1.4722395396474668e-05, "loss": 0.0023, "num_tokens": 37018754.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8186681424091496, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.04577395226806402, "learning_rate": 1.4719556963261148e-05, "loss": 0.0018, "num_tokens": 37026687.0, "reward": 1.6875, "reward_std": 0.2587745785713196, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 342.875, "completions/mean_terminated_length": 342.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8188526102195167, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.06463017547503114, "learning_rate": 1.4716718040755346e-05, "loss": 0.0026, "num_tokens": 37036982.0, "reward": 1.466397762298584, "reward_std": 0.3072623610496521, "rewards/fixed_code_pass_all_test_reward/mean": 0.46639782190322876, "rewards/fixed_code_pass_all_test_reward/std": 0.3072623908519745, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 453.125, "completions/mean_terminated_length": 453.125, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.8190370780298838, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.05264505883678794, "learning_rate": 1.4713878629251584e-05, "loss": 0.0021, "num_tokens": 37045287.0, "reward": 1.7259615659713745, "reward_std": 0.45295941829681396, "rewards/fixed_code_pass_all_test_reward/mean": 0.7259615659713745, "rewards/fixed_code_pass_all_test_reward/std": 0.45295944809913635, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8192215458402509, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.048185139428824186, "learning_rate": 1.4711038729044233e-05, "loss": 0.0019, "num_tokens": 37049804.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8194060136506179, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.052363231778144836, "learning_rate": 1.4708198340427719e-05, "loss": 0.0021, "num_tokens": 37055841.0, "reward": 1.3636363744735718, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.819590481460985, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.032014787779189646, "learning_rate": 1.4705357463696509e-05, "loss": 0.0013, "num_tokens": 37064006.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8197749492713522, "frac_reward_zero_std": 1.0, "grad_norm": 0.050048828125, "kl": 0.023892402183264494, "learning_rate": 1.4702516099145126e-05, "loss": 0.001, "num_tokens": 37072773.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8199594170817193, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.05666707525961101, "learning_rate": 1.4699674247068147e-05, "loss": 0.0023, "num_tokens": 37081394.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8201438848920863, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.05196774681098759, "learning_rate": 1.4696831907760198e-05, "loss": 0.0021, "num_tokens": 37087676.0, "reward": 1.7737069129943848, "reward_std": 0.1458437144756317, "rewards/fixed_code_pass_all_test_reward/mean": 0.7737069129943848, "rewards/fixed_code_pass_all_test_reward/std": 0.14584369957447052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 369.25, "completions/mean_terminated_length": 369.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8203283527024534, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.028647802071645856, "learning_rate": 1.469398908151595e-05, "loss": 0.0011, "num_tokens": 37094806.0, "reward": 1.4166667461395264, "reward_std": 0.235702246427536, "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8205128205128205, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.07041107444092631, "learning_rate": 1.4691145768630128e-05, "loss": 0.0028, "num_tokens": 37098970.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8206972883231876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.05504255369305611, "learning_rate": 1.4688301969397511e-05, "loss": 0.0022, "num_tokens": 37105614.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.8208817561335547, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.05268324865028262, "learning_rate": 1.4685457684112925e-05, "loss": 0.0021, "num_tokens": 37115061.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 314.75, "completions/mean_terminated_length": 314.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8210662239439218, "frac_reward_zero_std": 1.0, "grad_norm": 3.546875, "kl": 0.2285073499660939, "learning_rate": 1.4682612913071244e-05, "loss": 0.0091, "num_tokens": 37122803.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8212506917542889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0284423828125, "kl": 0.02116668305825442, "learning_rate": 1.4679767656567392e-05, "loss": 0.0008, "num_tokens": 37130103.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 504.125, "completions/mean_terminated_length": 504.125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.821435159564656, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.03326649544760585, "learning_rate": 1.4676921914896355e-05, "loss": 0.0013, "num_tokens": 37139632.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.821619627375023, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.05363059765659273, "learning_rate": 1.4674075688353155e-05, "loss": 0.0021, "num_tokens": 37144271.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8218040951853901, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.04232618445530534, "learning_rate": 1.4671228977232871e-05, "loss": 0.0017, "num_tokens": 37151348.0, "reward": 1.8010203838348389, "reward_std": 0.14170719683170319, "rewards/fixed_code_pass_all_test_reward/mean": 0.8010203838348389, "rewards/fixed_code_pass_all_test_reward/std": 0.1417071521282196, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8219885629957573, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.03401131136342883, "learning_rate": 1.466838178183063e-05, "loss": 0.0014, "num_tokens": 37156471.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 501.75, "completions/mean_terminated_length": 501.75, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.8221730308061244, "frac_reward_zero_std": 0.0, "grad_norm": 0.796875, "kl": 0.03302102710586041, "learning_rate": 1.4665534102441611e-05, "loss": 0.0013, "num_tokens": 37168013.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 520.375, "completions/mean_terminated_length": 520.375, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.8223574986164914, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.020508441841229796, "learning_rate": 1.4662685939361043e-05, "loss": 0.0008, "num_tokens": 37180696.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8225419664268585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0390625, "kl": 0.025889663957059383, "learning_rate": 1.4659837292884204e-05, "loss": 0.001, "num_tokens": 37186829.0, "reward": 1.8793103694915771, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.8793103694915771, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8227264342372256, "frac_reward_zero_std": 1.0, "grad_norm": 0.04443359375, "kl": 0.026947322534397244, "learning_rate": 1.4656988163306422e-05, "loss": 0.0011, "num_tokens": 37192672.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8229109020475927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.06098004523664713, "learning_rate": 1.465413855092308e-05, "loss": 0.0024, "num_tokens": 37202269.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.8230953698579598, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.018815435119904578, "learning_rate": 1.4651288456029602e-05, "loss": 0.0008, "num_tokens": 37209869.0, "reward": 1.93478262424469, "reward_std": 0.040253035724163055, "rewards/fixed_code_pass_all_test_reward/mean": 0.9347826242446899, "rewards/fixed_code_pass_all_test_reward/std": 0.040253039449453354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 415.375, "completions/mean_terminated_length": 415.375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8232798376683269, "frac_reward_zero_std": 1.0, "grad_norm": 0.03125, "kl": 0.02236573596019298, "learning_rate": 1.4648437878921466e-05, "loss": 0.0009, "num_tokens": 37218288.0, "reward": 1.476190447807312, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.4761904776096344, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.823464305478694, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.02983657654840499, "learning_rate": 1.4645586819894204e-05, "loss": 0.0012, "num_tokens": 37222719.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.823648773289061, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.024180538137443364, "learning_rate": 1.4642735279243398e-05, "loss": 0.001, "num_tokens": 37228740.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 394.875, "completions/mean_terminated_length": 394.875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.8238332410994281, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.05122938007116318, "learning_rate": 1.4639883257264669e-05, "loss": 0.002, "num_tokens": 37237939.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8240177089097952, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.031123207532800734, "learning_rate": 1.4637030754253703e-05, "loss": 0.0012, "num_tokens": 37243967.0, "reward": 1.9874999523162842, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.987500011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0353553481400013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 398.375, "completions/mean_terminated_length": 398.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.8242021767201624, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.0388511479832232, "learning_rate": 1.463417777050622e-05, "loss": 0.0016, "num_tokens": 37254458.0, "reward": 1.7213854789733887, "reward_std": 0.05882582813501358, "rewards/fixed_code_pass_all_test_reward/mean": 0.7213855385780334, "rewards/fixed_code_pass_all_test_reward/std": 0.05882588401436806, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8243866445305295, "frac_reward_zero_std": 1.0, "grad_norm": 0.061767578125, "kl": 0.040384803898632526, "learning_rate": 1.4631324306318009e-05, "loss": 0.0016, "num_tokens": 37260095.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8245711123408965, "frac_reward_zero_std": 1.0, "grad_norm": 0.053466796875, "kl": 0.025733416783623397, "learning_rate": 1.462847036198489e-05, "loss": 0.001, "num_tokens": 37265827.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 237.25, "completions/mean_terminated_length": 237.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8247555801512636, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.0459861836861819, "learning_rate": 1.4625615937802745e-05, "loss": 0.0018, "num_tokens": 37271613.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 324.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8249400479616307, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.051257923943921924, "learning_rate": 1.4622761034067499e-05, "loss": 0.0021, "num_tokens": 37278610.0, "reward": 1.5714285373687744, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8251245157719977, "frac_reward_zero_std": 1.0, "grad_norm": 0.056396484375, "kl": 0.04899001447483897, "learning_rate": 1.4619905651075132e-05, "loss": 0.002, "num_tokens": 37287809.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8253089835823649, "frac_reward_zero_std": 0.0, "grad_norm": 0.87890625, "kl": 0.02402504440397024, "learning_rate": 1.461704978912167e-05, "loss": 0.001, "num_tokens": 37292688.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.825493451392732, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.04842424509115517, "learning_rate": 1.4614193448503189e-05, "loss": 0.0019, "num_tokens": 37300957.0, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8256779192030991, "frac_reward_zero_std": 1.0, "grad_norm": 0.2431640625, "kl": 0.08185578417032957, "learning_rate": 1.4611336629515818e-05, "loss": 0.0033, "num_tokens": 37307206.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8258623870134661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.02131652587559074, "learning_rate": 1.4608479332455729e-05, "loss": 0.0009, "num_tokens": 37311381.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8260468548238332, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.020436199265532196, "learning_rate": 1.460562155761915e-05, "loss": 0.0008, "num_tokens": 37321226.0, "reward": 1.9276316165924072, "reward_std": 0.13400031626224518, "rewards/fixed_code_pass_all_test_reward/mean": 0.9276315569877625, "rewards/fixed_code_pass_all_test_reward/std": 0.1340002864599228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8262313226342003, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.050412192242220044, "learning_rate": 1.4602763305302357e-05, "loss": 0.002, "num_tokens": 37331387.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8264157904445675, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.06437540682964027, "learning_rate": 1.4599904575801673e-05, "loss": 0.0026, "num_tokens": 37335589.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 448.375, "completions/mean_terminated_length": 448.375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.8266002582549346, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.03262524155434221, "learning_rate": 1.4597045369413471e-05, "loss": 0.0013, "num_tokens": 37344272.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8267847260653016, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.050574913155287504, "learning_rate": 1.4594185686434176e-05, "loss": 0.002, "num_tokens": 37352250.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 371.625, "completions/mean_terminated_length": 371.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.8269691938756687, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.03438901528716087, "learning_rate": 1.4591325527160262e-05, "loss": 0.0014, "num_tokens": 37362359.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8271536616860358, "frac_reward_zero_std": 1.0, "grad_norm": 0.28515625, "kl": 0.05558170052245259, "learning_rate": 1.4588464891888252e-05, "loss": 0.0022, "num_tokens": 37369624.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8273381294964028, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.03641491453163326, "learning_rate": 1.4585603780914714e-05, "loss": 0.0015, "num_tokens": 37373443.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 466.25, "completions/mean_terminated_length": 466.25, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.8275225973067699, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.035306291887536645, "learning_rate": 1.458274219453627e-05, "loss": 0.0014, "num_tokens": 37384685.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8277070651171371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.025669215945526958, "learning_rate": 1.457988013304959e-05, "loss": 0.001, "num_tokens": 37389776.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 100.625, "completions/mean_terminated_length": 100.625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.8278915329275042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.040187884122133255, "learning_rate": 1.4577017596751399e-05, "loss": 0.0016, "num_tokens": 37393285.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 405.0, "completions/mean_terminated_length": 405.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.8280760007378712, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.0373152126558125, "learning_rate": 1.4574154585938458e-05, "loss": 0.0015, "num_tokens": 37401221.0, "reward": 1.8223683834075928, "reward_std": 0.24515488743782043, "rewards/fixed_code_pass_all_test_reward/mean": 0.8223684430122375, "rewards/fixed_code_pass_all_test_reward/std": 0.24515485763549805, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8282604685482383, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.055354381911456585, "learning_rate": 1.457129110090759e-05, "loss": 0.0022, "num_tokens": 37410785.0, "reward": 1.703125, "reward_std": 0.3199993073940277, "rewards/fixed_code_pass_all_test_reward/mean": 0.703125, "rewards/fixed_code_pass_all_test_reward/std": 0.3199993073940277, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8284449363586054, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.04033209034241736, "learning_rate": 1.4568427141955656e-05, "loss": 0.0016, "num_tokens": 37419099.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.8286294041689725, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.0659853364340961, "learning_rate": 1.4565562709379581e-05, "loss": 0.0026, "num_tokens": 37426094.0, "reward": 1.8263888359069824, "reward_std": 0.32146528363227844, "rewards/fixed_code_pass_all_test_reward/mean": 0.8263888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.3214653432369232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8288138719793396, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.060248988680541515, "learning_rate": 1.4562697803476325e-05, "loss": 0.0024, "num_tokens": 37437484.0, "reward": 1.2053570747375488, "reward_std": 0.11729146540164948, "rewards/fixed_code_pass_all_test_reward/mean": 0.205357164144516, "rewards/fixed_code_pass_all_test_reward/std": 0.11729148030281067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8289983397897067, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.05441493587568402, "learning_rate": 1.4559832424542901e-05, "loss": 0.0022, "num_tokens": 37443550.0, "reward": 1.225000023841858, "reward_std": 0.2492847442626953, "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, "rewards/fixed_code_pass_all_test_reward/std": 0.24928469955921173, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8291828076000738, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.04032497154548764, "learning_rate": 1.4556966572876377e-05, "loss": 0.0016, "num_tokens": 37451073.0, "reward": 1.0, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8293672754104409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.023513261345215142, "learning_rate": 1.4554100248773863e-05, "loss": 0.0009, "num_tokens": 37456085.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8295517432208079, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04222102393396199, "learning_rate": 1.4551233452532518e-05, "loss": 0.0017, "num_tokens": 37462326.0, "reward": 1.7702206373214722, "reward_std": 0.10918562859296799, "rewards/fixed_code_pass_all_test_reward/mean": 0.7702206373214722, "rewards/fixed_code_pass_all_test_reward/std": 0.1091856062412262, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.829736211031175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0419921875, "kl": 0.022089880891144276, "learning_rate": 1.4548366184449556e-05, "loss": 0.0009, "num_tokens": 37469354.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.8299206788415422, "frac_reward_zero_std": 1.0, "grad_norm": 0.73828125, "kl": 0.09100817097350955, "learning_rate": 1.4545498444822237e-05, "loss": 0.0036, "num_tokens": 37473766.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8301051466519093, "frac_reward_zero_std": 1.0, "grad_norm": 0.047119140625, "kl": 0.022257568780332804, "learning_rate": 1.4542630233947868e-05, "loss": 0.0009, "num_tokens": 37480872.0, "reward": 1.375, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8302896144622763, "frac_reward_zero_std": 0.0, "grad_norm": 2.859375, "kl": 0.0736381453461945, "learning_rate": 1.4539761552123803e-05, "loss": 0.0029, "num_tokens": 37484763.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8304740822726434, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.04495581751689315, "learning_rate": 1.4536892399647449e-05, "loss": 0.0018, "num_tokens": 37488909.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.8306585500830105, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.048828182741999626, "learning_rate": 1.4534022776816264e-05, "loss": 0.002, "num_tokens": 37499287.0, "reward": 1.8954546451568604, "reward_std": 0.03856949508190155, "rewards/fixed_code_pass_all_test_reward/mean": 0.8954545259475708, "rewards/fixed_code_pass_all_test_reward/std": 0.03856946527957916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.8308430178933776, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.033572531305253506, "learning_rate": 1.4531152683927749e-05, "loss": 0.0013, "num_tokens": 37506220.0, "reward": 1.960714340209961, "reward_std": 0.08215394616127014, "rewards/fixed_code_pass_all_test_reward/mean": 0.9607142806053162, "rewards/fixed_code_pass_all_test_reward/std": 0.08215394616127014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.8310274857037447, "frac_reward_zero_std": 1.0, "grad_norm": 0.515625, "kl": 0.09441158547997475, "learning_rate": 1.4528282121279455e-05, "loss": 0.0038, "num_tokens": 37515459.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8312119535141118, "frac_reward_zero_std": 1.0, "grad_norm": 0.205078125, "kl": 0.08699578884989023, "learning_rate": 1.4525411089168986e-05, "loss": 0.0035, "num_tokens": 37522272.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8313964213244789, "frac_reward_zero_std": 1.0, "grad_norm": 0.173828125, "kl": 0.04309878475032747, "learning_rate": 1.4522539587893988e-05, "loss": 0.0017, "num_tokens": 37526673.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.831580889134846, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.04440365731716156, "learning_rate": 1.4519667617752164e-05, "loss": 0.0018, "num_tokens": 37532961.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.831765356945213, "frac_reward_zero_std": 1.0, "grad_norm": 0.12158203125, "kl": 0.05035535246133804, "learning_rate": 1.4516795179041255e-05, "loss": 0.002, "num_tokens": 37543174.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 466.625, "completions/mean_terminated_length": 466.625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.8319498247555801, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.03411020780913532, "learning_rate": 1.451392227205906e-05, "loss": 0.0014, "num_tokens": 37552275.0, "reward": 1.723557710647583, "reward_std": 0.3541412949562073, "rewards/fixed_code_pass_all_test_reward/mean": 0.723557710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.35414132475852966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8321342925659473, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.02259766194038093, "learning_rate": 1.4511048897103423e-05, "loss": 0.0009, "num_tokens": 37557771.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8323187603763144, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.05405281111598015, "learning_rate": 1.4508175054472233e-05, "loss": 0.0022, "num_tokens": 37563443.0, "reward": 1.965517282485962, "reward_std": 0.0975319966673851, "rewards/fixed_code_pass_all_test_reward/mean": 0.9655172228813171, "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 364.625, "completions/mean_terminated_length": 364.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.8325032281866814, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.08603674033656716, "learning_rate": 1.4505300744463435e-05, "loss": 0.0034, "num_tokens": 37574208.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8326876959970485, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.025676840625237674, "learning_rate": 1.4502425967375016e-05, "loss": 0.001, "num_tokens": 37581985.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 379.5, "completions/mean_terminated_length": 379.5, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8328721638074156, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.01952862663893029, "learning_rate": 1.4499550723505014e-05, "loss": 0.0008, "num_tokens": 37590389.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8330566316177827, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.06135472655296326, "learning_rate": 1.4496675013151516e-05, "loss": 0.0025, "num_tokens": 37600643.0, "reward": 1.9583333730697632, "reward_std": 0.11785109341144562, "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 462.375, "completions/mean_terminated_length": 462.375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.8332410994281498, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.048348306911066175, "learning_rate": 1.4493798836612656e-05, "loss": 0.0019, "num_tokens": 37609910.0, "reward": 1.8250000476837158, "reward_std": 0.36154431104660034, "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8334255672385169, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.05179552035406232, "learning_rate": 1.4490922194186614e-05, "loss": 0.0021, "num_tokens": 37618261.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.833610035048884, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.04271516017615795, "learning_rate": 1.4488045086171629e-05, "loss": 0.0017, "num_tokens": 37623868.0, "reward": 1.96875, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 621.25, "completions/mean_terminated_length": 621.25, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.833794502859251, "frac_reward_zero_std": 0.0, "grad_norm": 0.6171875, "kl": 0.023189717903733253, "learning_rate": 1.4485167512865972e-05, "loss": 0.0009, "num_tokens": 37636926.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 514.25, "completions/mean_terminated_length": 514.25, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.8339789706696181, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.03334027389064431, "learning_rate": 1.4482289474567975e-05, "loss": 0.0013, "num_tokens": 37646352.0, "reward": 1.866666555404663, "reward_std": 0.13213752210140228, "rewards/fixed_code_pass_all_test_reward/mean": 0.8666666746139526, "rewards/fixed_code_pass_all_test_reward/std": 0.1321374922990799, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 713.25, "completions/mean_terminated_length": 713.25, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.8341634384799852, "frac_reward_zero_std": 0.0, "grad_norm": 0.65234375, "kl": 0.0185983584378846, "learning_rate": 1.4479410971576013e-05, "loss": 0.0007, "num_tokens": 37659242.0, "reward": 1.5340908765792847, "reward_std": 0.498965859413147, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.49896588921546936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8343479062903524, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.041237954050302505, "learning_rate": 1.4476532004188512e-05, "loss": 0.0016, "num_tokens": 37669530.0, "reward": 1.564814805984497, "reward_std": 0.12001370638608932, "rewards/fixed_code_pass_all_test_reward/mean": 0.5648148059844971, "rewards/fixed_code_pass_all_test_reward/std": 0.12001372873783112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.8345323741007195, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05169519118499011, "learning_rate": 1.447365257270394e-05, "loss": 0.0021, "num_tokens": 37674394.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8347168419110865, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.040332772536203265, "learning_rate": 1.447077267742082e-05, "loss": 0.0016, "num_tokens": 37681959.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8349013097214536, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.06222177390009165, "learning_rate": 1.446789231863772e-05, "loss": 0.0025, "num_tokens": 37691022.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8350857775318207, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.05941414460539818, "learning_rate": 1.4465011496653259e-05, "loss": 0.0024, "num_tokens": 37697548.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8352702453421877, "frac_reward_zero_std": 1.0, "grad_norm": 0.09619140625, "kl": 0.033833860885351896, "learning_rate": 1.4462130211766094e-05, "loss": 0.0014, "num_tokens": 37703305.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 472.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8354547131525549, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.06556448247283697, "learning_rate": 1.4459248464274944e-05, "loss": 0.0026, "num_tokens": 37712789.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 417.375, "completions/mean_terminated_length": 417.375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.835639180962922, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.033427080838009715, "learning_rate": 1.4456366254478569e-05, "loss": 0.0013, "num_tokens": 37721176.0, "reward": 1.8620131015777588, "reward_std": 0.11426407098770142, "rewards/fixed_code_pass_all_test_reward/mean": 0.8620129823684692, "rewards/fixed_code_pass_all_test_reward/std": 0.114264115691185, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 481.625, "completions/mean_terminated_length": 481.625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8358236487732891, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.014388896175660193, "learning_rate": 1.4453483582675775e-05, "loss": 0.0006, "num_tokens": 37729397.0, "reward": 1.6875, "reward_std": 0.6373774409294128, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.8360081165836561, "frac_reward_zero_std": 1.0, "grad_norm": 0.036865234375, "kl": 0.018829017062671483, "learning_rate": 1.4450600449165421e-05, "loss": 0.0008, "num_tokens": 37735873.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8361925843940232, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03819097252562642, "learning_rate": 1.4447716854246408e-05, "loss": 0.0015, "num_tokens": 37740401.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8363770522043903, "frac_reward_zero_std": 0.0, "grad_norm": 0.76171875, "kl": 0.023557094973511994, "learning_rate": 1.444483279821769e-05, "loss": 0.0009, "num_tokens": 37747458.0, "reward": 1.8472222089767456, "reward_std": 0.25153848528862, "rewards/fixed_code_pass_all_test_reward/mean": 0.8472222089767456, "rewards/fixed_code_pass_all_test_reward/std": 0.25153848528862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8365615200147575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.03345900750719011, "learning_rate": 1.4441948281378266e-05, "loss": 0.0013, "num_tokens": 37752433.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.8367459878251245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.055046494118869305, "learning_rate": 1.4439063304027183e-05, "loss": 0.0022, "num_tokens": 37763965.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8369304556354916, "frac_reward_zero_std": 1.0, "grad_norm": 0.2119140625, "kl": 0.08972206944599748, "learning_rate": 1.4436177866463537e-05, "loss": 0.0036, "num_tokens": 37768001.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8371149234458587, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.05695320665836334, "learning_rate": 1.4433291968986474e-05, "loss": 0.0023, "num_tokens": 37777186.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.8372993912562258, "frac_reward_zero_std": 1.0, "grad_norm": 0.046875, "kl": 0.034970608074218035, "learning_rate": 1.4430405611895177e-05, "loss": 0.0014, "num_tokens": 37784045.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.8374838590665928, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.04002674319781363, "learning_rate": 1.4427518795488891e-05, "loss": 0.0016, "num_tokens": 37791720.0, "reward": 1.9186046123504639, "reward_std": 0.09866604208946228, "rewards/fixed_code_pass_all_test_reward/mean": 0.9186046123504639, "rewards/fixed_code_pass_all_test_reward/std": 0.09866604208946228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 328.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.83766832687696, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04765365784987807, "learning_rate": 1.4424631520066899e-05, "loss": 0.0019, "num_tokens": 37797298.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8378527946873271, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.051682702731341124, "learning_rate": 1.4421743785928536e-05, "loss": 0.0021, "num_tokens": 37803285.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 400.125, "completions/mean_terminated_length": 400.125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.8380372624976942, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.02960884536150843, "learning_rate": 1.4418855593373182e-05, "loss": 0.0012, "num_tokens": 37811630.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8382217303080612, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.06615860527381301, "learning_rate": 1.4415966942700266e-05, "loss": 0.0026, "num_tokens": 37820088.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8384061981184283, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.040938820922747254, "learning_rate": 1.4413077834209263e-05, "loss": 0.0016, "num_tokens": 37825431.0, "reward": 1.899999976158142, "reward_std": 0.2828426957130432, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 421.375, "completions/mean_terminated_length": 421.375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.8385906659287954, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.034580805571749806, "learning_rate": 1.4410188268199701e-05, "loss": 0.0014, "num_tokens": 37835954.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8387751337391626, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.1285075508058071, "learning_rate": 1.4407298244971144e-05, "loss": 0.0051, "num_tokens": 37840682.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8389596015495296, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.022204914945177734, "learning_rate": 1.4404407764823216e-05, "loss": 0.0009, "num_tokens": 37847211.0, "reward": 1.379807710647583, "reward_std": 0.421042799949646, "rewards/fixed_code_pass_all_test_reward/mean": 0.379807710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.4210428297519684, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 357.75, "completions/mean_terminated_length": 357.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8391440693598967, "frac_reward_zero_std": 1.0, "grad_norm": 0.162109375, "kl": 0.06957817077636719, "learning_rate": 1.4401516828055579e-05, "loss": 0.0028, "num_tokens": 37856585.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.8393285371702638, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.03513953648507595, "learning_rate": 1.439862543496795e-05, "loss": 0.0014, "num_tokens": 37863846.0, "reward": 1.5714285373687744, "reward_std": 0.3211449980735779, "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, "rewards/fixed_code_pass_all_test_reward/std": 0.3211449980735779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8395130049806309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.03687436110340059, "learning_rate": 1.4395733585860088e-05, "loss": 0.0015, "num_tokens": 37876826.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 317.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.8396974727909979, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.037704933900386095, "learning_rate": 1.4392841281031796e-05, "loss": 0.0015, "num_tokens": 37883662.0, "reward": 1.52173912525177, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 641.75, "completions/mean_terminated_length": 641.75, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.839881940601365, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.03241912368685007, "learning_rate": 1.4389948520782934e-05, "loss": 0.0013, "num_tokens": 37896684.0, "reward": 1.9848484992980957, "reward_std": 0.02805512771010399, "rewards/fixed_code_pass_all_test_reward/mean": 0.9848484992980957, "rewards/fixed_code_pass_all_test_reward/std": 0.028055155649781227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 440.625, "completions/mean_terminated_length": 440.625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8400664084117322, "frac_reward_zero_std": 1.0, "grad_norm": 0.05810546875, "kl": 0.03254800895228982, "learning_rate": 1.4387055305413406e-05, "loss": 0.0013, "num_tokens": 37905569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 512.75, "completions/mean_terminated_length": 512.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8402508762220993, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.05436923401430249, "learning_rate": 1.4384161635223157e-05, "loss": 0.0022, "num_tokens": 37919575.0, "reward": 1.6726189851760864, "reward_std": 0.12582732737064362, "rewards/fixed_code_pass_all_test_reward/mean": 0.6726190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.125827357172966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8404353440324663, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.02542793843895197, "learning_rate": 1.4381267510512182e-05, "loss": 0.001, "num_tokens": 37928674.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8406198118428334, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.02108191279694438, "learning_rate": 1.4378372931580531e-05, "loss": 0.0008, "num_tokens": 37939228.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 635.375, "completions/mean_terminated_length": 635.375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.8408042796532005, "frac_reward_zero_std": 0.0, "grad_norm": 0.97265625, "kl": 0.02184521418530494, "learning_rate": 1.437547789872829e-05, "loss": 0.0009, "num_tokens": 37953463.0, "reward": 1.8888888359069824, "reward_std": 0.31426966190338135, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8409887474635676, "frac_reward_zero_std": 1.0, "grad_norm": 0.32421875, "kl": 0.08385562896728516, "learning_rate": 1.4372582412255599e-05, "loss": 0.0034, "num_tokens": 37957518.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.8411732152739347, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.05616301903501153, "learning_rate": 1.4369686472462639e-05, "loss": 0.0022, "num_tokens": 37967176.0, "reward": 1.724759578704834, "reward_std": 0.29354214668273926, "rewards/fixed_code_pass_all_test_reward/mean": 0.724759578704834, "rewards/fixed_code_pass_all_test_reward/std": 0.29354214668273926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 376.5, "completions/mean_terminated_length": 376.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8413576830843018, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.06292706169188023, "learning_rate": 1.4366790079649646e-05, "loss": 0.0025, "num_tokens": 37974908.0, "reward": 1.4632353782653809, "reward_std": 0.3392840325832367, "rewards/fixed_code_pass_all_test_reward/mean": 0.4632353186607361, "rewards/fixed_code_pass_all_test_reward/std": 0.33928412199020386, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8415421508946689, "frac_reward_zero_std": 1.0, "grad_norm": 2.15625, "kl": 0.05460579879581928, "learning_rate": 1.4363893234116897e-05, "loss": 0.0022, "num_tokens": 37980651.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.841726618705036, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.036561060696840286, "learning_rate": 1.4360995936164718e-05, "loss": 0.0015, "num_tokens": 37985879.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.841911086515403, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.048594933934509754, "learning_rate": 1.4358098186093481e-05, "loss": 0.0019, "num_tokens": 37994744.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8420955543257701, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.037993370671756566, "learning_rate": 1.4355199984203607e-05, "loss": 0.0015, "num_tokens": 38001868.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8422800221361373, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.03935114876367152, "learning_rate": 1.4352301330795562e-05, "loss": 0.0016, "num_tokens": 38007880.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 347.375, "completions/mean_terminated_length": 347.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.8424644899465044, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.04007950960658491, "learning_rate": 1.4349402226169856e-05, "loss": 0.0016, "num_tokens": 38015339.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8426489577568714, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.04747556382790208, "learning_rate": 1.434650267062705e-05, "loss": 0.0019, "num_tokens": 38020326.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8428334255672385, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.051590414019301534, "learning_rate": 1.4343602664467757e-05, "loss": 0.0021, "num_tokens": 38033769.0, "reward": 1.0178570747375488, "reward_std": 0.05050762742757797, "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8430178933776056, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.05273324251174927, "learning_rate": 1.434070220799262e-05, "loss": 0.0021, "num_tokens": 38042853.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8432023611879726, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05033519444987178, "learning_rate": 1.4337801301502348e-05, "loss": 0.002, "num_tokens": 38050951.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8433868289983398, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.03563853702507913, "learning_rate": 1.433489994529768e-05, "loss": 0.0014, "num_tokens": 38055340.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8435712968087069, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.06850841618143022, "learning_rate": 1.4331998139679416e-05, "loss": 0.0027, "num_tokens": 38059582.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.843755764619074, "frac_reward_zero_std": 1.0, "grad_norm": 0.044189453125, "kl": 0.04532699077390134, "learning_rate": 1.4329095884948394e-05, "loss": 0.0018, "num_tokens": 38069327.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 488.625, "completions/mean_terminated_length": 488.625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.843940232429441, "frac_reward_zero_std": 0.0, "grad_norm": 0.828125, "kl": 0.048503436613827944, "learning_rate": 1.4326193181405497e-05, "loss": 0.0019, "num_tokens": 38080540.0, "reward": 1.7083333730697632, "reward_std": 0.2136233150959015, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.2136233150959015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8441247002398081, "frac_reward_zero_std": 1.0, "grad_norm": 0.0703125, "kl": 0.051132958848029375, "learning_rate": 1.4323290029351662e-05, "loss": 0.002, "num_tokens": 38089535.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 422.875, "completions/mean_terminated_length": 422.875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8443091680501752, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.03595753642730415, "learning_rate": 1.4320386429087868e-05, "loss": 0.0014, "num_tokens": 38098446.0, "reward": 1.559999942779541, "reward_std": 0.45544329285621643, "rewards/fixed_code_pass_all_test_reward/mean": 0.6850000023841858, "rewards/fixed_code_pass_all_test_reward/std": 0.33101576566696167, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 378.5, "completions/mean_terminated_length": 378.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8444936358605424, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.06097805546596646, "learning_rate": 1.431748238091514e-05, "loss": 0.0024, "num_tokens": 38109810.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8446781036709095, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.04442149959504604, "learning_rate": 1.4314577885134548e-05, "loss": 0.0018, "num_tokens": 38120070.0, "reward": 1.920212745666504, "reward_std": 0.13103443384170532, "rewards/fixed_code_pass_all_test_reward/mean": 0.9202127456665039, "rewards/fixed_code_pass_all_test_reward/std": 0.13103443384170532, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 437.625, "completions/mean_terminated_length": 437.625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.8448625714812765, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.020964991999790072, "learning_rate": 1.4311672942047214e-05, "loss": 0.0008, "num_tokens": 38128883.0, "reward": 1.808333396911621, "reward_std": 0.108012355864048, "rewards/fixed_code_pass_all_test_reward/mean": 0.8083333373069763, "rewards/fixed_code_pass_all_test_reward/std": 0.108012355864048, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8450470392916436, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.042093463242053986, "learning_rate": 1.4308767551954306e-05, "loss": 0.0017, "num_tokens": 38133994.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8452315071020107, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.049778236891143024, "learning_rate": 1.4305861715157027e-05, "loss": 0.002, "num_tokens": 38142314.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 446.75, "completions/mean_terminated_length": 446.75, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.8454159749123777, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.03358162404038012, "learning_rate": 1.4302955431956642e-05, "loss": 0.0013, "num_tokens": 38153328.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8456004427227449, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.028915792470797896, "learning_rate": 1.4300048702654455e-05, "loss": 0.0012, "num_tokens": 38160548.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.845784910533112, "frac_reward_zero_std": 0.0, "grad_norm": 3.59375, "kl": 0.03875794424675405, "learning_rate": 1.4297141527551813e-05, "loss": 0.0016, "num_tokens": 38164479.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8459693783434791, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03222684934735298, "learning_rate": 1.4294233906950113e-05, "loss": 0.0013, "num_tokens": 38172137.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 656.375, "completions/mean_terminated_length": 656.375, "completions/min_length": 528.0, "completions/min_terminated_length": 528.0, "epoch": 0.8461538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 0.72265625, "kl": 0.01860711502376944, "learning_rate": 1.4291325841150798e-05, "loss": 0.0007, "num_tokens": 38186540.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 436.875, "completions/mean_terminated_length": 436.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.8463383139642132, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.0303362071281299, "learning_rate": 1.4288417330455358e-05, "loss": 0.0012, "num_tokens": 38194907.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8465227817745803, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.04035550489788875, "learning_rate": 1.4285508375165332e-05, "loss": 0.0016, "num_tokens": 38200969.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8467072495849475, "frac_reward_zero_std": 1.0, "grad_norm": 0.052001953125, "kl": 0.023657016921788454, "learning_rate": 1.428259897558229e-05, "loss": 0.0009, "num_tokens": 38205752.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8468917173953145, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.05741121247410774, "learning_rate": 1.427968913200787e-05, "loss": 0.0023, "num_tokens": 38213978.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 328.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8470761852056816, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.05841058096848428, "learning_rate": 1.4276778844743739e-05, "loss": 0.0023, "num_tokens": 38224353.0, "reward": 1.7750000953674316, "reward_std": 0.3284160792827606, "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.3284161388874054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8472606530160487, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.021851423487532884, "learning_rate": 1.4273868114091621e-05, "loss": 0.0009, "num_tokens": 38230656.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8474451208264158, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.048348452895879745, "learning_rate": 1.4270956940353278e-05, "loss": 0.0019, "num_tokens": 38236677.0, "reward": 1.0760869979858398, "reward_std": 0.020126517862081528, "rewards/fixed_code_pass_all_test_reward/mean": 0.07608695328235626, "rewards/fixed_code_pass_all_test_reward/std": 0.020126523450016975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 463.875, "completions/mean_terminated_length": 463.875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.8476295886367828, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.027008824865333736, "learning_rate": 1.4268045323830519e-05, "loss": 0.0011, "num_tokens": 38246164.0, "reward": 1.6041667461395264, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.84781405644715, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.03559684753417969, "learning_rate": 1.4265133264825209e-05, "loss": 0.0014, "num_tokens": 38256231.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8479985242575171, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.03747866733465344, "learning_rate": 1.4262220763639244e-05, "loss": 0.0015, "num_tokens": 38262910.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8481829920678842, "frac_reward_zero_std": 1.0, "grad_norm": 0.031982421875, "kl": 0.01759509032126516, "learning_rate": 1.425930782057457e-05, "loss": 0.0007, "num_tokens": 38269866.0, "reward": 1.2222222089767456, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8483674598782512, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.08471361128613353, "learning_rate": 1.425639443593319e-05, "loss": 0.0034, "num_tokens": 38274988.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8485519276886183, "frac_reward_zero_std": 1.0, "grad_norm": 0.03515625, "kl": 0.022619985742494464, "learning_rate": 1.425348061001714e-05, "loss": 0.0009, "num_tokens": 38281078.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8487363954989854, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.05580732366070151, "learning_rate": 1.4250566343128504e-05, "loss": 0.0022, "num_tokens": 38287570.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.8489208633093526, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.06061607296578586, "learning_rate": 1.4247651635569419e-05, "loss": 0.0024, "num_tokens": 38291333.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8491053311197196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.05291810049675405, "learning_rate": 1.424473648764206e-05, "loss": 0.0021, "num_tokens": 38300091.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.8492897989300867, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.03605956328101456, "learning_rate": 1.4241820899648651e-05, "loss": 0.0014, "num_tokens": 38311030.0, "reward": 1.6607143878936768, "reward_std": 0.2850758135318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.285075843334198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8494742667404538, "frac_reward_zero_std": 1.0, "grad_norm": 0.4296875, "kl": 0.05825456231832504, "learning_rate": 1.4238904871891456e-05, "loss": 0.0023, "num_tokens": 38314934.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8496587345508209, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.04353521764278412, "learning_rate": 1.4235988404672795e-05, "loss": 0.0017, "num_tokens": 38324697.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8498432023611879, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.049427412915974855, "learning_rate": 1.4233071498295026e-05, "loss": 0.002, "num_tokens": 38333280.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 507.375, "completions/mean_terminated_length": 507.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8500276701715551, "frac_reward_zero_std": 0.0, "grad_norm": 0.9765625, "kl": 0.025019120890647173, "learning_rate": 1.4230154153060555e-05, "loss": 0.001, "num_tokens": 38346723.0, "reward": 1.8304924964904785, "reward_std": 0.12414851039648056, "rewards/fixed_code_pass_all_test_reward/mean": 0.8304924368858337, "rewards/fixed_code_pass_all_test_reward/std": 0.12414851784706116, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 516.875, "completions/mean_terminated_length": 516.875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.8502121379819222, "frac_reward_zero_std": 0.0, "grad_norm": 0.66015625, "kl": 0.0345127556938678, "learning_rate": 1.4227236369271832e-05, "loss": 0.0014, "num_tokens": 38360930.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8503966057922893, "frac_reward_zero_std": 1.0, "grad_norm": 0.032470703125, "kl": 0.02621616458054632, "learning_rate": 1.4224318147231353e-05, "loss": 0.001, "num_tokens": 38367009.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8505810736026563, "frac_reward_zero_std": 1.0, "grad_norm": 0.04931640625, "kl": 0.01958166650729254, "learning_rate": 1.422139948724166e-05, "loss": 0.0008, "num_tokens": 38372972.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 523.5, "completions/mean_terminated_length": 523.5, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8507655414130234, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.058367958292365074, "learning_rate": 1.4218480389605345e-05, "loss": 0.0023, "num_tokens": 38388040.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8509500092233905, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.053669103886932135, "learning_rate": 1.421556085462503e-05, "loss": 0.0021, "num_tokens": 38392788.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.8511344770337577, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.0692718462087214, "learning_rate": 1.4212640882603406e-05, "loss": 0.0028, "num_tokens": 38396545.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8513189448441247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.04387856216635555, "learning_rate": 1.4209720473843187e-05, "loss": 0.0018, "num_tokens": 38406231.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8515034126544918, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.07112867990508676, "learning_rate": 1.4206799628647145e-05, "loss": 0.0028, "num_tokens": 38412398.0, "reward": 1.2430555820465088, "reward_std": 0.2684386670589447, "rewards/fixed_code_pass_all_test_reward/mean": 0.2430555671453476, "rewards/fixed_code_pass_all_test_reward/std": 0.2684386670589447, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8516878804648589, "frac_reward_zero_std": 1.0, "grad_norm": 0.197265625, "kl": 0.0843011059332639, "learning_rate": 1.420387834731809e-05, "loss": 0.0034, "num_tokens": 38417492.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.851872348275226, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.043110474944114685, "learning_rate": 1.4200956630158889e-05, "loss": 0.0017, "num_tokens": 38427757.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.852056816085593, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.03616967354901135, "learning_rate": 1.419803447747244e-05, "loss": 0.0014, "num_tokens": 38432036.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.8522412838959601, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.05243367259390652, "learning_rate": 1.4195111889561695e-05, "loss": 0.0021, "num_tokens": 38439162.0, "reward": 1.9107142686843872, "reward_std": 0.12518209218978882, "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, "rewards/fixed_code_pass_all_test_reward/std": 0.12518209218978882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 748.875, "completions/mean_terminated_length": 748.875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "epoch": 0.8524257517063273, "frac_reward_zero_std": 0.0, "grad_norm": 0.6328125, "kl": 0.027471379144117236, "learning_rate": 1.4192188866729643e-05, "loss": 0.0011, "num_tokens": 38456049.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.8526102195166944, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.040016493294388056, "learning_rate": 1.4189265409279331e-05, "loss": 0.0016, "num_tokens": 38465272.0, "reward": 1.859375, "reward_std": 0.08010874688625336, "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, "rewards/fixed_code_pass_all_test_reward/std": 0.08010874688625336, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8527946873270614, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.06577845476567745, "learning_rate": 1.418634151751384e-05, "loss": 0.0026, "num_tokens": 38471394.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.8529791551374285, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.035824211314320564, "learning_rate": 1.4183417191736301e-05, "loss": 0.0014, "num_tokens": 38478681.0, "reward": 1.6875, "reward_std": 0.40510135889053345, "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, "rewards/fixed_code_pass_all_test_reward/std": 0.4051014184951782, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.8531636229477956, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.07212663046084344, "learning_rate": 1.4180492432249885e-05, "loss": 0.0029, "num_tokens": 38487563.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8533480907581626, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.0662439635489136, "learning_rate": 1.4177567239357817e-05, "loss": 0.0026, "num_tokens": 38496142.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8535325585685298, "frac_reward_zero_std": 1.0, "grad_norm": 0.154296875, "kl": 0.058274308452382684, "learning_rate": 1.4174641613363358e-05, "loss": 0.0023, "num_tokens": 38504438.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8537170263788969, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.02619321981910616, "learning_rate": 1.4171715554569816e-05, "loss": 0.001, "num_tokens": 38510445.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.853901494189264, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.047740797279402614, "learning_rate": 1.4168789063280553e-05, "loss": 0.0019, "num_tokens": 38518008.0, "reward": 1.5612244606018066, "reward_std": 0.08307758718729019, "rewards/fixed_code_pass_all_test_reward/mean": 0.5612244606018066, "rewards/fixed_code_pass_all_test_reward/std": 0.08307760953903198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 384.625, "completions/mean_terminated_length": 384.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.854085961999631, "frac_reward_zero_std": 1.0, "grad_norm": 0.05615234375, "kl": 0.03802112769335508, "learning_rate": 1.4165862139798958e-05, "loss": 0.0015, "num_tokens": 38527293.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 418.875, "completions/mean_terminated_length": 418.875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8542704298099981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0289306640625, "kl": 0.034402198158204556, "learning_rate": 1.4162934784428484e-05, "loss": 0.0014, "num_tokens": 38535332.0, "reward": 1.7058823108673096, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8544548976203652, "frac_reward_zero_std": 1.0, "grad_norm": 0.1953125, "kl": 0.04122254904359579, "learning_rate": 1.416000699747261e-05, "loss": 0.0016, "num_tokens": 38545824.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8546393654307324, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.052680740132927895, "learning_rate": 1.415707877923488e-05, "loss": 0.0021, "num_tokens": 38555702.0, "reward": 1.105769157409668, "reward_std": 0.29916054010391235, "rewards/fixed_code_pass_all_test_reward/mean": 0.10576923191547394, "rewards/fixed_code_pass_all_test_reward/std": 0.29916059970855713, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 818.5, "completions/mean_terminated_length": 818.5, "completions/min_length": 630.0, "completions/min_terminated_length": 630.0, "epoch": 0.8548238332410995, "frac_reward_zero_std": 1.0, "grad_norm": 0.04296875, "kl": 0.0287223911145702, "learning_rate": 1.4154150130018867e-05, "loss": 0.0011, "num_tokens": 38569594.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 403.875, "completions/mean_terminated_length": 403.875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8550083010514665, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.06830438645556569, "learning_rate": 1.4151221050128193e-05, "loss": 0.0027, "num_tokens": 38580545.0, "reward": 1.1354167461395264, "reward_std": 0.1886538565158844, "rewards/fixed_code_pass_all_test_reward/mean": 0.1354166716337204, "rewards/fixed_code_pass_all_test_reward/std": 0.1886538714170456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8551927688618336, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.05095339729450643, "learning_rate": 1.4148291539866524e-05, "loss": 0.002, "num_tokens": 38590628.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 454.5, "completions/mean_terminated_length": 454.5, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.8553772366722007, "frac_reward_zero_std": 0.0, "grad_norm": 0.8984375, "kl": 0.032174097606912255, "learning_rate": 1.4145361599537577e-05, "loss": 0.0013, "num_tokens": 38599688.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 516.625, "completions/mean_terminated_length": 516.625, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.8555617044825677, "frac_reward_zero_std": 1.0, "grad_norm": 0.279296875, "kl": 0.059816672233864665, "learning_rate": 1.4142431229445106e-05, "loss": 0.0024, "num_tokens": 38611869.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8557461722929349, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.029067994095385075, "learning_rate": 1.4139500429892916e-05, "loss": 0.0012, "num_tokens": 38622955.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.855930640103302, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.052311800653114915, "learning_rate": 1.4136569201184844e-05, "loss": 0.0021, "num_tokens": 38629056.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8561151079136691, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.034833905403502285, "learning_rate": 1.413363754362479e-05, "loss": 0.0014, "num_tokens": 38637208.0, "reward": 1.8928570747375488, "reward_std": 0.30304577946662903, "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 401.5, "completions/mean_terminated_length": 401.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8562995757240361, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.03321625152602792, "learning_rate": 1.4130705457516683e-05, "loss": 0.0013, "num_tokens": 38645412.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 414.625, "completions/mean_terminated_length": 414.625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.8564840435344032, "frac_reward_zero_std": 0.0, "grad_norm": 0.73046875, "kl": 0.03590543591417372, "learning_rate": 1.4127772943164506e-05, "loss": 0.0014, "num_tokens": 38657305.0, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8566685113447703, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.029136902536265552, "learning_rate": 1.4124840000872275e-05, "loss": 0.0012, "num_tokens": 38667604.0, "reward": 1.6319444179534912, "reward_std": 0.404654860496521, "rewards/fixed_code_pass_all_test_reward/mean": 0.631944477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.40465492010116577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 459.5, "completions/mean_terminated_length": 459.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.8568529791551375, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.03150592965539545, "learning_rate": 1.4121906630944069e-05, "loss": 0.0013, "num_tokens": 38676152.0, "reward": 1.5113636255264282, "reward_std": 0.1607060581445694, "rewards/fixed_code_pass_all_test_reward/mean": 0.5113636255264282, "rewards/fixed_code_pass_all_test_reward/std": 0.16070608794689178, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 520.0, "completions/mean_terminated_length": 520.0, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.8570374469655045, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.01683203608263284, "learning_rate": 1.4118972833683993e-05, "loss": 0.0007, "num_tokens": 38685984.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8572219147758716, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.05156271671876311, "learning_rate": 1.4116038609396203e-05, "loss": 0.0021, "num_tokens": 38694559.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.8574063825862387, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.02123437717091292, "learning_rate": 1.4113103958384903e-05, "loss": 0.0008, "num_tokens": 38701595.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8575908503966058, "frac_reward_zero_std": 1.0, "grad_norm": 0.23046875, "kl": 0.05607993807643652, "learning_rate": 1.411016888095434e-05, "loss": 0.0022, "num_tokens": 38708430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8577753182069728, "frac_reward_zero_std": 1.0, "grad_norm": 0.045654296875, "kl": 0.034819274209439754, "learning_rate": 1.4107233377408797e-05, "loss": 0.0014, "num_tokens": 38719305.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.85795978601734, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.07306134095415473, "learning_rate": 1.4104297448052612e-05, "loss": 0.0029, "num_tokens": 38728410.0, "reward": 1.4821428060531616, "reward_std": 0.2850758135318756, "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.2850758135318756, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.8581442538277071, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.015275871934136376, "learning_rate": 1.4101361093190162e-05, "loss": 0.0006, "num_tokens": 38732399.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8583287216380742, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.02554186386987567, "learning_rate": 1.409842431312587e-05, "loss": 0.001, "num_tokens": 38737668.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8585131894484412, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04166206298395991, "learning_rate": 1.40954871081642e-05, "loss": 0.0017, "num_tokens": 38745320.0, "reward": 1.9596774578094482, "reward_std": 0.0746629536151886, "rewards/fixed_code_pass_all_test_reward/mean": 0.9596773982048035, "rewards/fixed_code_pass_all_test_reward/std": 0.07466292381286621, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8586976572588083, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.0377315788064152, "learning_rate": 1.409254947860966e-05, "loss": 0.0015, "num_tokens": 38752653.0, "reward": 1.9956896305084229, "reward_std": 0.01219149399548769, "rewards/fixed_code_pass_all_test_reward/mean": 0.9956896305084229, "rewards/fixed_code_pass_all_test_reward/std": 0.012191502377390862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.8588821250691754, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.07363160327076912, "learning_rate": 1.4089611424766808e-05, "loss": 0.0029, "num_tokens": 38756933.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 625.375, "completions/mean_terminated_length": 625.375, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.8590665928795426, "frac_reward_zero_std": 0.0, "grad_norm": 0.921875, "kl": 0.04570416756905615, "learning_rate": 1.4086672946940238e-05, "loss": 0.0018, "num_tokens": 38770040.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8592510606899096, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.022240989143028855, "learning_rate": 1.4083734045434597e-05, "loss": 0.0009, "num_tokens": 38775549.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.8594355285002767, "frac_reward_zero_std": 0.0, "grad_norm": 0.859375, "kl": 0.020981371984817088, "learning_rate": 1.408079472055456e-05, "loss": 0.0008, "num_tokens": 38783249.0, "reward": 1.953125, "reward_std": 0.09300297498703003, "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, "rewards/fixed_code_pass_all_test_reward/std": 0.09300298243761063, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8596199963106438, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.03129088052082807, "learning_rate": 1.4077854972604872e-05, "loss": 0.0013, "num_tokens": 38789619.0, "reward": 1.6510417461395264, "reward_std": 0.21355074644088745, "rewards/fixed_code_pass_all_test_reward/mean": 0.6510416865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.21355074644088745, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 398.375, "completions/mean_terminated_length": 398.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8598044641210109, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.06676631979644299, "learning_rate": 1.4074914801890294e-05, "loss": 0.0027, "num_tokens": 38799774.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 437.0, "completions/mean_terminated_length": 437.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.8599889319313779, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.05240622255951166, "learning_rate": 1.4071974208715653e-05, "loss": 0.0021, "num_tokens": 38810334.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 507.75, "completions/mean_terminated_length": 507.75, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.8601733997417451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.03791411221027374, "learning_rate": 1.40690331933858e-05, "loss": 0.0015, "num_tokens": 38822228.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8603578675521122, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.0486620282754302, "learning_rate": 1.4066091756205646e-05, "loss": 0.0019, "num_tokens": 38827676.0, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8605423353624793, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.04177918133791536, "learning_rate": 1.4063149897480139e-05, "loss": 0.0017, "num_tokens": 38837497.0, "reward": 1.875, "reward_std": 0.2314550280570984, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8607268031728463, "frac_reward_zero_std": 1.0, "grad_norm": 0.20703125, "kl": 0.04682913247961551, "learning_rate": 1.4060207617514275e-05, "loss": 0.0019, "num_tokens": 38844182.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8609112709832134, "frac_reward_zero_std": 1.0, "grad_norm": 1.453125, "kl": 0.11341713066212833, "learning_rate": 1.4057264916613078e-05, "loss": 0.0045, "num_tokens": 38854027.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8610957387935805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.03464405215345323, "learning_rate": 1.4054321795081643e-05, "loss": 0.0014, "num_tokens": 38865557.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 405.625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.8612802066039477, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.048946703085675836, "learning_rate": 1.405137825322508e-05, "loss": 0.002, "num_tokens": 38875098.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8614646744143147, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.04572533001191914, "learning_rate": 1.4048434291348567e-05, "loss": 0.0018, "num_tokens": 38882053.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 396.0, "completions/mean_terminated_length": 396.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.8616491422246818, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0394243742339313, "learning_rate": 1.4045489909757307e-05, "loss": 0.0016, "num_tokens": 38891677.0, "reward": 1.65625, "reward_std": 0.27294278144836426, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.27294281125068665, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 645.125, "completions/mean_terminated_length": 645.125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.8618336100350489, "frac_reward_zero_std": 0.0, "grad_norm": 0.8515625, "kl": 0.022074261447414756, "learning_rate": 1.4042545108756558e-05, "loss": 0.0009, "num_tokens": 38908086.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 419.875, "completions/mean_terminated_length": 419.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.862018077845416, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.0434700867626816, "learning_rate": 1.4039599888651614e-05, "loss": 0.0017, "num_tokens": 38918125.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 369.875, "completions/mean_terminated_length": 369.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.862202545655783, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.03971857321448624, "learning_rate": 1.4036654249747817e-05, "loss": 0.0016, "num_tokens": 38925548.0, "reward": 1.9166667461395264, "reward_std": 0.23570223152637482, "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8623870134661502, "frac_reward_zero_std": 1.0, "grad_norm": 0.050048828125, "kl": 0.027675921679474413, "learning_rate": 1.403370819235055e-05, "loss": 0.0011, "num_tokens": 38929752.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.8625714812765173, "frac_reward_zero_std": 0.0, "grad_norm": 4.53125, "kl": 0.22531674487981945, "learning_rate": 1.4030761716765246e-05, "loss": 0.009, "num_tokens": 38935525.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8627559490868844, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.02901852980721742, "learning_rate": 1.402781482329737e-05, "loss": 0.0012, "num_tokens": 38944401.0, "reward": 1.2893518209457397, "reward_std": 0.5244312882423401, "rewards/fixed_code_pass_all_test_reward/mean": 0.41435185074806213, "rewards/fixed_code_pass_all_test_reward/std": 0.17788177728652954, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8629404168972514, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.026566158689092845, "learning_rate": 1.4024867512252438e-05, "loss": 0.0011, "num_tokens": 38951060.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8631248847076185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.04402469424530864, "learning_rate": 1.4021919783936008e-05, "loss": 0.0018, "num_tokens": 38958223.0, "reward": 1.1770833730697632, "reward_std": 0.337910920381546, "rewards/fixed_code_pass_all_test_reward/mean": 0.1770833432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.33791089057922363, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8633093525179856, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.08293632697314024, "learning_rate": 1.4018971638653684e-05, "loss": 0.0033, "num_tokens": 38962124.0, "reward": 0.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 388.875, "completions/mean_terminated_length": 388.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8634938203283528, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.0634770910255611, "learning_rate": 1.4016023076711106e-05, "loss": 0.0025, "num_tokens": 38971419.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8636782881387198, "frac_reward_zero_std": 1.0, "grad_norm": 0.041015625, "kl": 0.0472181998193264, "learning_rate": 1.4013074098413962e-05, "loss": 0.0019, "num_tokens": 38981288.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 448.25, "completions/mean_terminated_length": 448.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8638627559490869, "frac_reward_zero_std": 0.0, "grad_norm": 0.96484375, "kl": 0.029731591464951634, "learning_rate": 1.4010124704067983e-05, "loss": 0.0012, "num_tokens": 38989458.0, "reward": 1.5340909957885742, "reward_std": 0.3601700961589813, "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, "rewards/fixed_code_pass_all_test_reward/std": 0.3601701557636261, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 638.75, "completions/mean_terminated_length": 638.75, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.864047223759454, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.0362716494128108, "learning_rate": 1.4007174893978941e-05, "loss": 0.0015, "num_tokens": 39002464.0, "reward": 1.9696969985961914, "reward_std": 0.08570989966392517, "rewards/fixed_code_pass_all_test_reward/mean": 0.9696969985961914, "rewards/fixed_code_pass_all_test_reward/std": 0.08570991456508636, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 347.25, "completions/mean_terminated_length": 347.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.864231691569821, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.06427385751157999, "learning_rate": 1.4004224668452657e-05, "loss": 0.0026, "num_tokens": 39011818.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.8644161593801881, "frac_reward_zero_std": 0.0, "grad_norm": 0.69921875, "kl": 0.019109230604954064, "learning_rate": 1.4001274027794983e-05, "loss": 0.0008, "num_tokens": 39022530.0, "reward": 1.4409723281860352, "reward_std": 0.22588132321834564, "rewards/fixed_code_pass_all_test_reward/mean": 0.440972238779068, "rewards/fixed_code_pass_all_test_reward/std": 0.22588135302066803, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.8646006271905552, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.0596800297498703, "learning_rate": 1.399832297231183e-05, "loss": 0.0024, "num_tokens": 39026569.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 472.5, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.8647850950009224, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.029860817012377083, "learning_rate": 1.399537150230914e-05, "loss": 0.0012, "num_tokens": 39035477.0, "reward": 1.8125, "reward_std": 0.3482097089290619, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3482097089290619, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 408.75, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8649695628112894, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.03785516903735697, "learning_rate": 1.3992419618092903e-05, "loss": 0.0015, "num_tokens": 39043427.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8651540306216565, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.08576841419562697, "learning_rate": 1.398946731996915e-05, "loss": 0.0034, "num_tokens": 39047592.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 466.5, "completions/mean_terminated_length": 466.5, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.8653384984320236, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.03537092311307788, "learning_rate": 1.3986514608243957e-05, "loss": 0.0014, "num_tokens": 39059052.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.8655229662423907, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.04479034268297255, "learning_rate": 1.3983561483223438e-05, "loss": 0.0018, "num_tokens": 39062720.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.8657074340527577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.02328692434821278, "learning_rate": 1.3980607945213756e-05, "loss": 0.0009, "num_tokens": 39066979.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 565.625, "completions/mean_terminated_length": 565.625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.8658919018631249, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.0344025946687907, "learning_rate": 1.3977653994521112e-05, "loss": 0.0014, "num_tokens": 39077368.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.866076369673492, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.02852268482092768, "learning_rate": 1.3974699631451759e-05, "loss": 0.0011, "num_tokens": 39083498.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8662608374838591, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.023713955422863364, "learning_rate": 1.3971744856311975e-05, "loss": 0.0009, "num_tokens": 39087903.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8664453052942261, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.028498757048510015, "learning_rate": 1.3968789669408098e-05, "loss": 0.0011, "num_tokens": 39093815.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8666297731045932, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.04299801285378635, "learning_rate": 1.3965834071046502e-05, "loss": 0.0017, "num_tokens": 39102344.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8668142409149603, "frac_reward_zero_std": 1.0, "grad_norm": 0.0498046875, "kl": 0.031449089176021516, "learning_rate": 1.3962878061533602e-05, "loss": 0.0013, "num_tokens": 39111599.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 632.5, "completions/mean_terminated_length": 632.5, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.8669987087253275, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.0421741446480155, "learning_rate": 1.395992164117586e-05, "loss": 0.0017, "num_tokens": 39123179.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8671831765356945, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.02783536654897034, "learning_rate": 1.3956964810279775e-05, "loss": 0.0011, "num_tokens": 39131290.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8673676443460616, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.02601873315870762, "learning_rate": 1.3954007569151893e-05, "loss": 0.001, "num_tokens": 39136316.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8675521121564287, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.03543301299214363, "learning_rate": 1.39510499180988e-05, "loss": 0.0014, "num_tokens": 39145190.0, "reward": 1.8774752616882324, "reward_std": 0.346552312374115, "rewards/fixed_code_pass_all_test_reward/mean": 0.8774752616882324, "rewards/fixed_code_pass_all_test_reward/std": 0.3465523421764374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8677365799667958, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.031419944134540856, "learning_rate": 1.3948091857427126e-05, "loss": 0.0013, "num_tokens": 39150086.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8679210477771628, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.04398674704134464, "learning_rate": 1.3945133387443544e-05, "loss": 0.0018, "num_tokens": 39158563.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 591.625, "completions/mean_terminated_length": 591.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.86810551558753, "frac_reward_zero_std": 0.0, "grad_norm": 0.671875, "kl": 0.015890662907622755, "learning_rate": 1.3942174508454768e-05, "loss": 0.0006, "num_tokens": 39168232.0, "reward": 1.9642857313156128, "reward_std": 0.10101523250341415, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.8682899833978971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.044706322136335075, "learning_rate": 1.3939215220767557e-05, "loss": 0.0018, "num_tokens": 39172218.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8684744512082642, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.04174162331037223, "learning_rate": 1.3936255524688707e-05, "loss": 0.0017, "num_tokens": 39181339.0, "reward": 1.2999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8686589190186312, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.06961822230368853, "learning_rate": 1.3933295420525059e-05, "loss": 0.0028, "num_tokens": 39185407.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 465.5, "completions/mean_terminated_length": 465.5, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.8688433868289983, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.0350196931976825, "learning_rate": 1.3930334908583498e-05, "loss": 0.0014, "num_tokens": 39198059.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 448.5, "completions/mean_terminated_length": 448.5, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8690278546393654, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.0317831733264029, "learning_rate": 1.3927373989170955e-05, "loss": 0.0013, "num_tokens": 39210791.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8692123224497326, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.03357195947319269, "learning_rate": 1.392441266259439e-05, "loss": 0.0013, "num_tokens": 39223720.0, "reward": 1.5663264989852905, "reward_std": 0.10101527720689774, "rewards/fixed_code_pass_all_test_reward/mean": 0.5663264989852905, "rewards/fixed_code_pass_all_test_reward/std": 0.10101527720689774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8693967902600996, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.03081578010460362, "learning_rate": 1.392145092916082e-05, "loss": 0.0012, "num_tokens": 39228857.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.8695812580704667, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.05426775990054011, "learning_rate": 1.3918488789177298e-05, "loss": 0.0022, "num_tokens": 39232980.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8697657258808338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.058262263890355825, "learning_rate": 1.3915526242950915e-05, "loss": 0.0023, "num_tokens": 39240370.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8699501936912009, "frac_reward_zero_std": 1.0, "grad_norm": 0.058837890625, "kl": 0.030521248700097203, "learning_rate": 1.3912563290788808e-05, "loss": 0.0012, "num_tokens": 39246613.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8701346615015679, "frac_reward_zero_std": 1.0, "grad_norm": 0.65234375, "kl": 0.05464904918335378, "learning_rate": 1.3909599932998159e-05, "loss": 0.0022, "num_tokens": 39251112.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8703191293119351, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.06137406104244292, "learning_rate": 1.390663616988619e-05, "loss": 0.0025, "num_tokens": 39255130.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8705035971223022, "frac_reward_zero_std": 1.0, "grad_norm": 0.057861328125, "kl": 0.030905081424862146, "learning_rate": 1.390367200176016e-05, "loss": 0.0012, "num_tokens": 39263522.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 508.125, "completions/mean_terminated_length": 508.125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.8706880649326693, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.02643391815945506, "learning_rate": 1.3900707428927376e-05, "loss": 0.0011, "num_tokens": 39272891.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8708725327430363, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.037322524236515164, "learning_rate": 1.3897742451695187e-05, "loss": 0.0015, "num_tokens": 39280091.0, "reward": 1.9017857313156128, "reward_std": 0.18185748159885406, "rewards/fixed_code_pass_all_test_reward/mean": 0.9017857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.18185752630233765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 488.0, "completions/mean_terminated_length": 265.14288330078125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8710570005534034, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.053849385818466544, "learning_rate": 1.3894777070370984e-05, "loss": 0.0022, "num_tokens": 39287483.0, "reward": 1.625, "reward_std": 0.7440237998962402, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.8712414683637705, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.06663461215794086, "learning_rate": 1.389181128526219e-05, "loss": 0.0027, "num_tokens": 39291323.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 685.25, "completions/mean_terminated_length": 685.25, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.8714259361741377, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.0304770766524598, "learning_rate": 1.3888845096676286e-05, "loss": 0.0012, "num_tokens": 39303389.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.8716104039845047, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.0554989711381495, "learning_rate": 1.3885878504920785e-05, "loss": 0.0022, "num_tokens": 39309547.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8717948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.047845219960436225, "learning_rate": 1.3882911510303241e-05, "loss": 0.0019, "num_tokens": 39314309.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8719793396052389, "frac_reward_zero_std": 1.0, "grad_norm": 0.1123046875, "kl": 0.05718048848211765, "learning_rate": 1.3879944113131251e-05, "loss": 0.0023, "num_tokens": 39320164.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.872163807415606, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.052743843058124185, "learning_rate": 1.3876976313712457e-05, "loss": 0.0021, "num_tokens": 39327476.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.872348275225973, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.021337245707400143, "learning_rate": 1.3874008112354545e-05, "loss": 0.0009, "num_tokens": 39334034.0, "reward": 1.899999976158142, "reward_std": 0.18516401946544647, "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8725327430363402, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.055611724965274334, "learning_rate": 1.3871039509365235e-05, "loss": 0.0022, "num_tokens": 39341750.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8727172108467073, "frac_reward_zero_std": 1.0, "grad_norm": 0.79296875, "kl": 0.06065567955374718, "learning_rate": 1.3868070505052287e-05, "loss": 0.0024, "num_tokens": 39351071.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 403.375, "completions/mean_terminated_length": 403.375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.8729016786570744, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.027958056307397783, "learning_rate": 1.3865101099723515e-05, "loss": 0.0011, "num_tokens": 39361082.0, "reward": 1.9765625, "reward_std": 0.06629125773906708, "rewards/fixed_code_pass_all_test_reward/mean": 0.9765625, "rewards/fixed_code_pass_all_test_reward/std": 0.06629125773906708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.8730861464674414, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03316507791168988, "learning_rate": 1.3862131293686762e-05, "loss": 0.0013, "num_tokens": 39369120.0, "reward": 1.6488094329833984, "reward_std": 0.45882448554039, "rewards/fixed_code_pass_all_test_reward/mean": 0.648809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.4588245153427124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 382.875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.8732706142778085, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.036426066188141704, "learning_rate": 1.3859161087249924e-05, "loss": 0.0015, "num_tokens": 39379863.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8734550820881756, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.04740915121510625, "learning_rate": 1.3856190480720926e-05, "loss": 0.0019, "num_tokens": 39386615.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8736395498985428, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.035562718054279685, "learning_rate": 1.3853219474407741e-05, "loss": 0.0014, "num_tokens": 39392177.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 403.625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8738240177089098, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421875, "kl": 0.02128627779893577, "learning_rate": 1.3850248068618388e-05, "loss": 0.0009, "num_tokens": 39400214.0, "reward": 1.3409090042114258, "reward_std": 0.09409989416599274, "rewards/fixed_code_pass_all_test_reward/mean": 0.34090909361839294, "rewards/fixed_code_pass_all_test_reward/std": 0.09409984946250916, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 577.25, "completions/mean_terminated_length": 577.25, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.8740084855192769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0224609375, "kl": 0.012722728541120887, "learning_rate": 1.3847276263660922e-05, "loss": 0.0005, "num_tokens": 39410264.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 386.875, "completions/mean_terminated_length": 386.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.874192953329644, "frac_reward_zero_std": 1.0, "grad_norm": 0.875, "kl": 0.0531512814341113, "learning_rate": 1.3844304059843435e-05, "loss": 0.0021, "num_tokens": 39419263.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 428.375, "completions/mean_terminated_length": 428.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.874377421140011, "frac_reward_zero_std": 1.0, "grad_norm": 0.04443359375, "kl": 0.024918338982388377, "learning_rate": 1.3841331457474067e-05, "loss": 0.001, "num_tokens": 39429994.0, "reward": 1.7469879388809204, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.7469879388809204, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8745618889503781, "frac_reward_zero_std": 1.0, "grad_norm": 0.039794921875, "kl": 0.03135993366595358, "learning_rate": 1.3838358456861e-05, "loss": 0.0013, "num_tokens": 39437430.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.8747463567607453, "frac_reward_zero_std": 1.0, "grad_norm": 0.2158203125, "kl": 0.05347280763089657, "learning_rate": 1.3835385058312456e-05, "loss": 0.0021, "num_tokens": 39442878.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8749308245711124, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.03216646471992135, "learning_rate": 1.3832411262136692e-05, "loss": 0.0013, "num_tokens": 39452073.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8751152923814794, "frac_reward_zero_std": 1.0, "grad_norm": 0.05419921875, "kl": 0.02850513276644051, "learning_rate": 1.3829437068642013e-05, "loss": 0.0011, "num_tokens": 39456783.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8752997601918465, "frac_reward_zero_std": 0.0, "grad_norm": 0.93359375, "kl": 0.06560150324366987, "learning_rate": 1.3826462478136768e-05, "loss": 0.0026, "num_tokens": 39467023.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 484.625, "completions/mean_terminated_length": 484.625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8754842280022136, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.015738442889414728, "learning_rate": 1.382348749092934e-05, "loss": 0.0006, "num_tokens": 39480700.0, "reward": 1.9276859760284424, "reward_std": 0.01753157190978527, "rewards/fixed_code_pass_all_test_reward/mean": 0.9276859760284424, "rewards/fixed_code_pass_all_test_reward/std": 0.01753157190978527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8756686958125807, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.04961149860173464, "learning_rate": 1.3820512107328152e-05, "loss": 0.002, "num_tokens": 39489542.0, "reward": 1.1508620977401733, "reward_std": 0.3506588041782379, "rewards/fixed_code_pass_all_test_reward/mean": 0.15086206793785095, "rewards/fixed_code_pass_all_test_reward/std": 0.3506588339805603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 463.875, "completions/mean_terminated_length": 463.875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.8758531636229479, "frac_reward_zero_std": 0.0, "grad_norm": 0.90234375, "kl": 0.028414310072548687, "learning_rate": 1.3817536327641678e-05, "loss": 0.0011, "num_tokens": 39500637.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 332.0, "completions/mean_terminated_length": 332.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8760376314333149, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.013821726315654814, "learning_rate": 1.3814560152178426e-05, "loss": 0.0006, "num_tokens": 39507445.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1711.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 789.5, "completions/mean_terminated_length": 789.5, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.876222099243682, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.03998230630531907, "learning_rate": 1.3811583581246941e-05, "loss": 0.0016, "num_tokens": 39520825.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8764065670540491, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.0433138650842011, "learning_rate": 1.3808606615155821e-05, "loss": 0.0017, "num_tokens": 39533897.0, "reward": 1.648809552192688, "reward_std": 0.11921755224466324, "rewards/fixed_code_pass_all_test_reward/mean": 0.648809552192688, "rewards/fixed_code_pass_all_test_reward/std": 0.11921756714582443, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8765910348644161, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.041911221109330654, "learning_rate": 1.3805629254213693e-05, "loss": 0.0017, "num_tokens": 39541180.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8767755026747832, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.03478107205592096, "learning_rate": 1.3802651498729234e-05, "loss": 0.0014, "num_tokens": 39550479.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 422.5, "completions/mean_terminated_length": 422.5, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.8769599704851503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.04243451589718461, "learning_rate": 1.3799673349011153e-05, "loss": 0.0017, "num_tokens": 39560971.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 417.75, "completions/mean_terminated_length": 417.75, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.8771444382955175, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.027786601800471544, "learning_rate": 1.379669480536821e-05, "loss": 0.0011, "num_tokens": 39573289.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8773289061058845, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.04395279474556446, "learning_rate": 1.3793715868109195e-05, "loss": 0.0018, "num_tokens": 39578860.0, "reward": 1.8985848426818848, "reward_std": 0.18778426945209503, "rewards/fixed_code_pass_all_test_reward/mean": 0.8985849022865295, "rewards/fixed_code_pass_all_test_reward/std": 0.18778428435325623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 414.875, "completions/mean_terminated_length": 414.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8775133739162516, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.04496568674221635, "learning_rate": 1.3790736537542948e-05, "loss": 0.0018, "num_tokens": 39588171.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 456.375, "completions/mean_terminated_length": 456.375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8776978417266187, "frac_reward_zero_std": 0.0, "grad_norm": 0.71875, "kl": 0.02382062072865665, "learning_rate": 1.3787756813978349e-05, "loss": 0.001, "num_tokens": 39597534.0, "reward": 1.399999976158142, "reward_std": 0.25657081604003906, "rewards/fixed_code_pass_all_test_reward/mean": 0.3999999761581421, "rewards/fixed_code_pass_all_test_reward/std": 0.25657081604003906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8778823095369858, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.039692720863968134, "learning_rate": 1.3784776697724307e-05, "loss": 0.0016, "num_tokens": 39602288.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8780667773473528, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.03313214611262083, "learning_rate": 1.3781796189089788e-05, "loss": 0.0013, "num_tokens": 39612845.0, "reward": 1.974662184715271, "reward_std": 0.07166623324155807, "rewards/fixed_code_pass_all_test_reward/mean": 0.974662184715271, "rewards/fixed_code_pass_all_test_reward/std": 0.07166622579097748, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.87825124515772, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.04306380683556199, "learning_rate": 1.3778815288383794e-05, "loss": 0.0017, "num_tokens": 39623324.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8784357129680871, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.04802516894415021, "learning_rate": 1.3775833995915356e-05, "loss": 0.0019, "num_tokens": 39630110.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8786201807784542, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.037016161950305104, "learning_rate": 1.3772852311993561e-05, "loss": 0.0015, "num_tokens": 39639563.0, "reward": 1.7999999523162842, "reward_std": 0.38544961810112, "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8788046485888212, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.08206583792343736, "learning_rate": 1.3769870236927526e-05, "loss": 0.0033, "num_tokens": 39648440.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.8789891163991883, "frac_reward_zero_std": 1.0, "grad_norm": 0.04052734375, "kl": 0.02462981583084911, "learning_rate": 1.3766887771026417e-05, "loss": 0.001, "num_tokens": 39664085.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8791735842095554, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.07272317353636026, "learning_rate": 1.3763904914599434e-05, "loss": 0.0029, "num_tokens": 39668172.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 350.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8793580520199226, "frac_reward_zero_std": 1.0, "grad_norm": 0.046630859375, "kl": 0.03064386628102511, "learning_rate": 1.3760921667955818e-05, "loss": 0.0012, "num_tokens": 39674635.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8795425198302896, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.04086106293834746, "learning_rate": 1.3757938031404856e-05, "loss": 0.0016, "num_tokens": 39680859.0, "reward": 1.0225000381469727, "reward_std": 0.036154430359601974, "rewards/fixed_code_pass_all_test_reward/mean": 0.022499999031424522, "rewards/fixed_code_pass_all_test_reward/std": 0.036154430359601974, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 95.0, "completions/mean_terminated_length": 95.0, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8797269876406567, "frac_reward_zero_std": 1.0, "grad_norm": 0.052978515625, "kl": 0.018019924114923924, "learning_rate": 1.3754954005255869e-05, "loss": 0.0007, "num_tokens": 39684315.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8799114554510238, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.04209405032452196, "learning_rate": 1.3751969589818221e-05, "loss": 0.0017, "num_tokens": 39692124.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8800959232613909, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.0858722566626966, "learning_rate": 1.3748984785401318e-05, "loss": 0.0034, "num_tokens": 39695657.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 441.625, "completions/mean_terminated_length": 441.625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.8802803910717579, "frac_reward_zero_std": 1.0, "grad_norm": 0.04833984375, "kl": 0.02058216172736138, "learning_rate": 1.3745999592314605e-05, "loss": 0.0008, "num_tokens": 39704054.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 288.25, "completions/mean_terminated_length": 288.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8804648588821251, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.03259724169038236, "learning_rate": 1.3743014010867564e-05, "loss": 0.0013, "num_tokens": 39710944.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.5175492167472839, "step": 4773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8806493266924922, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.04258326534181833, "learning_rate": 1.374002804136972e-05, "loss": 0.0017, "num_tokens": 39719351.0, "reward": 1.3161765336990356, "reward_std": 0.42989787459373474, "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, "rewards/fixed_code_pass_all_test_reward/std": 0.42989784479141235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.8808337945028593, "frac_reward_zero_std": 1.0, "grad_norm": 0.056640625, "kl": 0.0331149403937161, "learning_rate": 1.373704168413064e-05, "loss": 0.0013, "num_tokens": 39723864.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 217.625, "completions/mean_terminated_length": 217.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.8810182623132263, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.049930619308725, "learning_rate": 1.3734054939459936e-05, "loss": 0.002, "num_tokens": 39729485.0, "reward": 1.5499999523162842, "reward_std": 0.3726353943347931, "rewards/fixed_code_pass_all_test_reward/mean": 0.5499999523162842, "rewards/fixed_code_pass_all_test_reward/std": 0.3726354241371155, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 411.25, "completions/mean_terminated_length": 411.25, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8812027301235934, "frac_reward_zero_std": 0.0, "grad_norm": 0.349609375, "kl": 0.03233512002043426, "learning_rate": 1.3731067807667243e-05, "loss": 0.0013, "num_tokens": 39741823.0, "reward": 1.649193525314331, "reward_std": 0.2623138129711151, "rewards/fixed_code_pass_all_test_reward/mean": 0.649193525314331, "rewards/fixed_code_pass_all_test_reward/std": 0.2623138129711151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8813871979339605, "frac_reward_zero_std": 1.0, "grad_norm": 0.1875, "kl": 0.06477592792361975, "learning_rate": 1.3728080289062252e-05, "loss": 0.0026, "num_tokens": 39745873.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8815716657443277, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.035850972635671496, "learning_rate": 1.3725092383954688e-05, "loss": 0.0014, "num_tokens": 39754861.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8817561335546947, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.09666113369166851, "learning_rate": 1.3722104092654318e-05, "loss": 0.0039, "num_tokens": 39760270.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8819406013650618, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.06950349593535066, "learning_rate": 1.371911541547095e-05, "loss": 0.0028, "num_tokens": 39769475.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8821250691754289, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.077410111669451, "learning_rate": 1.3716126352714428e-05, "loss": 0.0031, "num_tokens": 39773777.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 1076.125, "completions/mean_terminated_length": 1076.125, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.882309536985796, "frac_reward_zero_std": 0.0, "grad_norm": 0.384765625, "kl": 0.014730526134371758, "learning_rate": 1.3713136904694637e-05, "loss": 0.0006, "num_tokens": 39794506.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.882494004796163, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.057129884604364634, "learning_rate": 1.3710147071721505e-05, "loss": 0.0023, "num_tokens": 39800432.0, "reward": 1.91847825050354, "reward_std": 0.23057830333709717, "rewards/fixed_code_pass_all_test_reward/mean": 0.91847825050354, "rewards/fixed_code_pass_all_test_reward/std": 0.23057828843593597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8826784726065302, "frac_reward_zero_std": 1.0, "grad_norm": 0.03759765625, "kl": 0.019371931557543576, "learning_rate": 1.3707156854104998e-05, "loss": 0.0008, "num_tokens": 39805837.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8828629404168973, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.04092761117499322, "learning_rate": 1.370416625215512e-05, "loss": 0.0016, "num_tokens": 39812284.0, "reward": 1.370192289352417, "reward_std": 0.17915162444114685, "rewards/fixed_code_pass_all_test_reward/mean": 0.370192289352417, "rewards/fixed_code_pass_all_test_reward/std": 0.17915163934230804, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8830474082272644, "frac_reward_zero_std": 1.0, "grad_norm": 0.23828125, "kl": 0.052423154236748815, "learning_rate": 1.3701175266181918e-05, "loss": 0.0021, "num_tokens": 39820672.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8832318760376314, "frac_reward_zero_std": 1.0, "grad_norm": 0.05517578125, "kl": 0.03279749350622296, "learning_rate": 1.3698183896495483e-05, "loss": 0.0013, "num_tokens": 39830064.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8834163438479985, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04007199499756098, "learning_rate": 1.369519214340593e-05, "loss": 0.0016, "num_tokens": 39834427.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 473.375, "completions/mean_terminated_length": 473.375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8836008116583656, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.048756099538877606, "learning_rate": 1.3692200007223428e-05, "loss": 0.002, "num_tokens": 39848150.0, "reward": 1.5535714626312256, "reward_std": 0.4056113660335541, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.09438391774892807, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8837852794687328, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.03142055100761354, "learning_rate": 1.3689207488258185e-05, "loss": 0.0013, "num_tokens": 39857261.0, "reward": 1.668269157409668, "reward_std": 0.4578319489955902, "rewards/fixed_code_pass_all_test_reward/mean": 0.6682692170143127, "rewards/fixed_code_pass_all_test_reward/std": 0.4578319489955902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8839697472790998, "frac_reward_zero_std": 0.0, "grad_norm": 0.86328125, "kl": 0.019864696776494384, "learning_rate": 1.3686214586820443e-05, "loss": 0.0008, "num_tokens": 39865384.0, "reward": 1.8125, "reward_std": 0.3471825420856476, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 475.125, "completions/mean_terminated_length": 475.125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8841542150894669, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.03055708098690957, "learning_rate": 1.3683221303220486e-05, "loss": 0.0012, "num_tokens": 39879409.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.884338682899834, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.05331244086846709, "learning_rate": 1.368022763776864e-05, "loss": 0.0021, "num_tokens": 39884410.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.884523150710201, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.05202515935525298, "learning_rate": 1.3677233590775262e-05, "loss": 0.0021, "num_tokens": 39888121.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.8847076185205681, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.02664763364009559, "learning_rate": 1.3674239162550764e-05, "loss": 0.0011, "num_tokens": 39896971.0, "reward": 1.7083332538604736, "reward_std": 0.3032888174057007, "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.30328884720802307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8848920863309353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.03534804133232683, "learning_rate": 1.3671244353405582e-05, "loss": 0.0014, "num_tokens": 39901801.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8850765541413024, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.041021901182830334, "learning_rate": 1.3668249163650197e-05, "loss": 0.0016, "num_tokens": 39905934.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8852610219516694, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.04544982104562223, "learning_rate": 1.3665253593595135e-05, "loss": 0.0018, "num_tokens": 39912107.0, "reward": 1.485576868057251, "reward_std": 0.04079460725188255, "rewards/fixed_code_pass_all_test_reward/mean": 0.48557692766189575, "rewards/fixed_code_pass_all_test_reward/std": 0.04079461842775345, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8854454897620365, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.031893985520582646, "learning_rate": 1.3662257643550958e-05, "loss": 0.0013, "num_tokens": 39917032.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8856299575724036, "frac_reward_zero_std": 0.0, "grad_norm": 0.75, "kl": 0.026645993581041694, "learning_rate": 1.365926131382826e-05, "loss": 0.0011, "num_tokens": 39925765.0, "reward": 1.9880952835083008, "reward_std": 0.022043362259864807, "rewards/fixed_code_pass_all_test_reward/mean": 0.988095223903656, "rewards/fixed_code_pass_all_test_reward/std": 0.02204333432018757, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8858144253827707, "frac_reward_zero_std": 1.0, "grad_norm": 0.240234375, "kl": 0.12777129234746099, "learning_rate": 1.3656264604737683e-05, "loss": 0.0051, "num_tokens": 39933828.0, "reward": 1.9642857313156128, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 423.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.8859988931931378, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.034718929207883775, "learning_rate": 1.3653267516589909e-05, "loss": 0.0014, "num_tokens": 39944377.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 394.25, "completions/mean_terminated_length": 394.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8861833610035049, "frac_reward_zero_std": 0.0, "grad_norm": 0.87109375, "kl": 0.03359021758660674, "learning_rate": 1.365027004969565e-05, "loss": 0.0013, "num_tokens": 39951875.0, "reward": 1.625, "reward_std": 0.6697873473167419, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.3874864876270294, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.886367828813872, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.04825064749456942, "learning_rate": 1.364727220436567e-05, "loss": 0.0019, "num_tokens": 39960251.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8865522966242391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.053420227486640215, "learning_rate": 1.364427398091076e-05, "loss": 0.0021, "num_tokens": 39974152.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 501.0, "completions/mean_terminated_length": 501.0, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.8867367644346061, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.0364383899141103, "learning_rate": 1.364127537964176e-05, "loss": 0.0015, "num_tokens": 39983392.0, "reward": 1.2083333730697632, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8869212322449732, "frac_reward_zero_std": 1.0, "grad_norm": 0.040283203125, "kl": 0.018094782019034028, "learning_rate": 1.3638276400869544e-05, "loss": 0.0007, "num_tokens": 39994471.0, "reward": 1.5056179761886597, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5056179761886597, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8871057000553404, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.0380824850872159, "learning_rate": 1.3635277044905025e-05, "loss": 0.0015, "num_tokens": 40003859.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1088.875, "completions/mean_terminated_length": 1088.875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 0.8872901678657075, "frac_reward_zero_std": 0.0, "grad_norm": 0.52734375, "kl": 0.01806294033303857, "learning_rate": 1.3632277312059157e-05, "loss": 0.0007, "num_tokens": 40024698.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8874746356760745, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.03752576420083642, "learning_rate": 1.3629277202642931e-05, "loss": 0.0015, "num_tokens": 40028468.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8876591034864416, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.04492254299111664, "learning_rate": 1.3626276716967382e-05, "loss": 0.0018, "num_tokens": 40037122.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 483.0, "completions/mean_terminated_length": 483.0, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8878435712968087, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.0470244197640568, "learning_rate": 1.3623275855343576e-05, "loss": 0.0019, "num_tokens": 40049474.0, "reward": 1.5, "reward_std": 0.0890870913863182, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0890870913863182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8880280391071758, "frac_reward_zero_std": 1.0, "grad_norm": 0.03662109375, "kl": 0.02145544334780425, "learning_rate": 1.3620274618082628e-05, "loss": 0.0009, "num_tokens": 40059345.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8882125069175429, "frac_reward_zero_std": 1.0, "grad_norm": 0.046142578125, "kl": 0.03213256527669728, "learning_rate": 1.3617273005495682e-05, "loss": 0.0013, "num_tokens": 40067396.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.88839697472791, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.027460610144771636, "learning_rate": 1.361427101789392e-05, "loss": 0.0011, "num_tokens": 40072855.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8885814425382771, "frac_reward_zero_std": 0.0, "grad_norm": 0.63671875, "kl": 0.06665262184105814, "learning_rate": 1.361126865558858e-05, "loss": 0.0027, "num_tokens": 40082108.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8887659103486442, "frac_reward_zero_std": 1.0, "grad_norm": 0.2314453125, "kl": 0.043979566777125, "learning_rate": 1.3608265918890919e-05, "loss": 0.0018, "num_tokens": 40087439.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.8889503781590112, "frac_reward_zero_std": 0.0, "grad_norm": 1.0078125, "kl": 0.02769853570498526, "learning_rate": 1.3605262808112247e-05, "loss": 0.0011, "num_tokens": 40098815.0, "reward": 1.9076087474822998, "reward_std": 0.1745910942554474, "rewards/fixed_code_pass_all_test_reward/mean": 0.907608687877655, "rewards/fixed_code_pass_all_test_reward/std": 0.1745910793542862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8891348459693783, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.02794547099620104, "learning_rate": 1.3602259323563895e-05, "loss": 0.0011, "num_tokens": 40104498.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8893193137797454, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.0868347748182714, "learning_rate": 1.3599255465557257e-05, "loss": 0.0035, "num_tokens": 40108385.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8895037815901126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0810546875, "kl": 0.027131236158311367, "learning_rate": 1.3596251234403747e-05, "loss": 0.0011, "num_tokens": 40118082.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8896882494004796, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0660396353341639, "learning_rate": 1.3593246630414826e-05, "loss": 0.0026, "num_tokens": 40121887.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8898727172108467, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.06254475214518607, "learning_rate": 1.3590241653901986e-05, "loss": 0.0025, "num_tokens": 40126198.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8900571850212138, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.05223834654316306, "learning_rate": 1.358723630517677e-05, "loss": 0.0021, "num_tokens": 40134629.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8902416528315809, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.021204240154474974, "learning_rate": 1.3584230584550749e-05, "loss": 0.0008, "num_tokens": 40138752.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8904261206419479, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.14607433998025954, "learning_rate": 1.3581224492335536e-05, "loss": 0.0058, "num_tokens": 40150014.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8906105884523151, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.05222631571814418, "learning_rate": 1.3578218028842782e-05, "loss": 0.0021, "num_tokens": 40155580.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8907950562626822, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.04727358045056462, "learning_rate": 1.3575211194384182e-05, "loss": 0.0019, "num_tokens": 40164237.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8909795240730493, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.031919239554554224, "learning_rate": 1.357220398927146e-05, "loss": 0.0013, "num_tokens": 40171148.0, "reward": 1.9318182468414307, "reward_std": 0.15932266414165497, "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, "rewards/fixed_code_pass_all_test_reward/std": 0.15932264924049377, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8911639918834163, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.033594183158129454, "learning_rate": 1.356919641381638e-05, "loss": 0.0013, "num_tokens": 40181055.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8913484596937834, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.036877939477562904, "learning_rate": 1.3566188468330754e-05, "loss": 0.0015, "num_tokens": 40189919.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8915329275041505, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.05413321126252413, "learning_rate": 1.3563180153126423e-05, "loss": 0.0022, "num_tokens": 40197171.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.8917173953145177, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.03542689455207437, "learning_rate": 1.356017146851527e-05, "loss": 0.0014, "num_tokens": 40201886.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8919018631248847, "frac_reward_zero_std": 1.0, "grad_norm": 0.05712890625, "kl": 0.021938129095360637, "learning_rate": 1.3557162414809213e-05, "loss": 0.0009, "num_tokens": 40207864.0, "reward": 1.7999999523162842, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8920863309352518, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.03136964561417699, "learning_rate": 1.3554152992320213e-05, "loss": 0.0013, "num_tokens": 40213322.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8922707987456189, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.039530866546556354, "learning_rate": 1.3551143201360266e-05, "loss": 0.0016, "num_tokens": 40221710.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.892455266555986, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.03659618191886693, "learning_rate": 1.354813304224141e-05, "loss": 0.0015, "num_tokens": 40225305.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 95.25, "completions/mean_terminated_length": 95.25, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.892639734366353, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.05242172966245562, "learning_rate": 1.354512251527571e-05, "loss": 0.0021, "num_tokens": 40228891.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8928242021767202, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0479169525206089, "learning_rate": 1.3542111620775287e-05, "loss": 0.0019, "num_tokens": 40233526.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8930086699870873, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07448538532480597, "learning_rate": 1.3539100359052286e-05, "loss": 0.003, "num_tokens": 40243832.0, "reward": 1.9271653890609741, "reward_std": 0.2060074657201767, "rewards/fixed_code_pass_all_test_reward/mean": 0.9271653890609741, "rewards/fixed_code_pass_all_test_reward/std": 0.2060074806213379, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8931931377974544, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.018278325733263046, "learning_rate": 1.3536088730418897e-05, "loss": 0.0007, "num_tokens": 40248128.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 480.125, "completions/mean_terminated_length": 480.125, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.8933776056078214, "frac_reward_zero_std": 1.0, "grad_norm": 0.034912109375, "kl": 0.022718505351804197, "learning_rate": 1.3533076735187341e-05, "loss": 0.0009, "num_tokens": 40257889.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.8935620734181885, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.0413172859698534, "learning_rate": 1.3530064373669887e-05, "loss": 0.0017, "num_tokens": 40267690.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8937465412285556, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.062498676124960184, "learning_rate": 1.3527051646178832e-05, "loss": 0.0025, "num_tokens": 40277504.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8939310090389228, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0595300542190671, "learning_rate": 1.3524038553026519e-05, "loss": 0.0024, "num_tokens": 40286122.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.8941154768492898, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.05630644969642162, "learning_rate": 1.3521025094525323e-05, "loss": 0.0023, "num_tokens": 40289726.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8942999446596569, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.05172666581347585, "learning_rate": 1.3518011270987661e-05, "loss": 0.0021, "num_tokens": 40298548.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 443.125, "completions/mean_terminated_length": 443.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.894484412470024, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.02979980653617531, "learning_rate": 1.3514997082725985e-05, "loss": 0.0012, "num_tokens": 40306061.0, "reward": 1.7000000476837158, "reward_std": 0.46598589420318604, "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, "rewards/fixed_code_pass_all_test_reward/std": 0.0534522607922554, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.894668880280391, "frac_reward_zero_std": 1.0, "grad_norm": 0.055419921875, "kl": 0.0531901849899441, "learning_rate": 1.3511982530052787e-05, "loss": 0.0021, "num_tokens": 40313571.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8948533480907581, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.05117065703961998, "learning_rate": 1.3508967613280594e-05, "loss": 0.002, "num_tokens": 40323746.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8950378159011253, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.02618949592579156, "learning_rate": 1.3505952332721976e-05, "loss": 0.001, "num_tokens": 40332183.0, "reward": 1.5, "reward_std": 0.5345224738121033, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8952222837114924, "frac_reward_zero_std": 1.0, "grad_norm": 0.390625, "kl": 0.051704831421375275, "learning_rate": 1.3502936688689534e-05, "loss": 0.0021, "num_tokens": 40338308.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8954067515218594, "frac_reward_zero_std": 1.0, "grad_norm": 0.061279296875, "kl": 0.02149977011140436, "learning_rate": 1.3499920681495915e-05, "loss": 0.0009, "num_tokens": 40344238.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/max_terminated_length": 802.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8955912193322265, "frac_reward_zero_std": 1.0, "grad_norm": 0.05029296875, "kl": 0.027827743906527758, "learning_rate": 1.349690431145379e-05, "loss": 0.0011, "num_tokens": 40350898.0, "reward": 1.0612244606018066, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.06122449040412903, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8957756871425936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0400390625, "kl": 0.022374557447619736, "learning_rate": 1.3493887578875881e-05, "loss": 0.0009, "num_tokens": 40357004.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8959601549529607, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.04444209625944495, "learning_rate": 1.3490870484074943e-05, "loss": 0.0018, "num_tokens": 40366762.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8961446227633278, "frac_reward_zero_std": 0.0, "grad_norm": 0.78515625, "kl": 0.05666601238772273, "learning_rate": 1.348785302736377e-05, "loss": 0.0023, "num_tokens": 40376755.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.8963290905736949, "frac_reward_zero_std": 1.0, "grad_norm": 0.059814453125, "kl": 0.018445522291585803, "learning_rate": 1.3484835209055188e-05, "loss": 0.0007, "num_tokens": 40381229.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.896513558384062, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.033602256793528795, "learning_rate": 1.3481817029462066e-05, "loss": 0.0013, "num_tokens": 40387988.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8966980261944291, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.04112620232626796, "learning_rate": 1.3478798488897309e-05, "loss": 0.0016, "num_tokens": 40396563.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8968824940047961, "frac_reward_zero_std": 1.0, "grad_norm": 0.041748046875, "kl": 0.03333612787537277, "learning_rate": 1.3475779587673859e-05, "loss": 0.0013, "num_tokens": 40406296.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8970669618151632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.03400420397520065, "learning_rate": 1.3472760326104694e-05, "loss": 0.0014, "num_tokens": 40414818.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8972514296255304, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.04012254602275789, "learning_rate": 1.3469740704502835e-05, "loss": 0.0016, "num_tokens": 40423656.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8974358974358975, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.029997592326253653, "learning_rate": 1.3466720723181337e-05, "loss": 0.0012, "num_tokens": 40431240.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8976203652462645, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.04764627886470407, "learning_rate": 1.3463700382453281e-05, "loss": 0.0019, "num_tokens": 40438333.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 439.75, "completions/mean_terminated_length": 439.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8978048330566316, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.04556818585842848, "learning_rate": 1.346067968263181e-05, "loss": 0.0018, "num_tokens": 40447027.0, "reward": 1.6785714626312256, "reward_std": 0.3393528461456299, "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, "rewards/fixed_code_pass_all_test_reward/std": 0.3393528461456299, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8979893008669987, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.04446255601942539, "learning_rate": 1.3457658624030079e-05, "loss": 0.0018, "num_tokens": 40453421.0, "reward": 1.625, "reward_std": 0.16690461337566376, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.16690459847450256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 376.625, "completions/mean_terminated_length": 376.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8981737686773658, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.03523054451216012, "learning_rate": 1.3454637206961299e-05, "loss": 0.0014, "num_tokens": 40459930.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8983582364877329, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.02882067859172821, "learning_rate": 1.3451615431738703e-05, "loss": 0.0012, "num_tokens": 40468585.0, "reward": 1.7874999046325684, "reward_std": 0.3234752416610718, "rewards/fixed_code_pass_all_test_reward/mean": 0.7874999642372131, "rewards/fixed_code_pass_all_test_reward/std": 0.3234752416610718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 377.625, "completions/mean_terminated_length": 377.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8985427042981, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.05101708578877151, "learning_rate": 1.3448593298675577e-05, "loss": 0.002, "num_tokens": 40481894.0, "reward": 1.5, "reward_std": 0.3118034303188324, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.3118034601211548, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8987271721084671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.042133047012612224, "learning_rate": 1.3445570808085229e-05, "loss": 0.0017, "num_tokens": 40491640.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.8989116399188342, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.04411594523116946, "learning_rate": 1.3442547960281014e-05, "loss": 0.0018, "num_tokens": 40501113.0, "reward": 1.1193182468414307, "reward_std": 0.048211827874183655, "rewards/fixed_code_pass_all_test_reward/mean": 0.11931818723678589, "rewards/fixed_code_pass_all_test_reward/std": 0.048211827874183655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8990961077292012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0439453125, "kl": 0.02667079772800207, "learning_rate": 1.343952475557632e-05, "loss": 0.0011, "num_tokens": 40509902.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8992805755395683, "frac_reward_zero_std": 1.0, "grad_norm": 0.06884765625, "kl": 0.0529350230935961, "learning_rate": 1.343650119428457e-05, "loss": 0.0021, "num_tokens": 40514124.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8994650433499355, "frac_reward_zero_std": 1.0, "grad_norm": 0.049072265625, "kl": 0.023814132902771235, "learning_rate": 1.3433477276719231e-05, "loss": 0.001, "num_tokens": 40520562.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 796.375, "completions/mean_terminated_length": 796.375, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.8996495111603026, "frac_reward_zero_std": 0.0, "grad_norm": 0.404296875, "kl": 0.017301700892858207, "learning_rate": 1.3430453003193801e-05, "loss": 0.0007, "num_tokens": 40536629.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8998339789706696, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.051614079624414444, "learning_rate": 1.3427428374021816e-05, "loss": 0.0021, "num_tokens": 40545990.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 441.125, "completions/mean_terminated_length": 441.125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.9000184467810367, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.057898911414667964, "learning_rate": 1.3424403389516854e-05, "loss": 0.0023, "num_tokens": 40558095.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 454.0, "completions/mean_terminated_length": 454.0, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.9002029145914038, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.050357280066236854, "learning_rate": 1.3421378049992514e-05, "loss": 0.002, "num_tokens": 40566423.0, "reward": 1.7916667461395264, "reward_std": 0.39591163396835327, "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 563.875, "completions/mean_terminated_length": 563.875, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.9003873824017709, "frac_reward_zero_std": 1.0, "grad_norm": 0.025634765625, "kl": 0.026917697046883404, "learning_rate": 1.3418352355762456e-05, "loss": 0.0011, "num_tokens": 40579886.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.900571850212138, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.038975171046331525, "learning_rate": 1.3415326307140355e-05, "loss": 0.0016, "num_tokens": 40589726.0, "reward": 1.53125, "reward_std": 0.7372426986694336, "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9007563180225051, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.01788379775825888, "learning_rate": 1.3412299904439939e-05, "loss": 0.0007, "num_tokens": 40597942.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9009407858328722, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.0736178532242775, "learning_rate": 1.3409273147974957e-05, "loss": 0.0029, "num_tokens": 40607037.0, "reward": 1.90625, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 360.875, "completions/mean_terminated_length": 360.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.9011252536432393, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.042822302551940084, "learning_rate": 1.3406246038059208e-05, "loss": 0.0017, "num_tokens": 40620228.0, "reward": 1.6443965435028076, "reward_std": 0.4907793700695038, "rewards/fixed_code_pass_all_test_reward/mean": 0.6443965435028076, "rewards/fixed_code_pass_all_test_reward/std": 0.49077939987182617, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9013097214536063, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.029818235663697124, "learning_rate": 1.3403218575006522e-05, "loss": 0.0012, "num_tokens": 40630490.0, "reward": 1.7652559280395508, "reward_std": 0.4358837604522705, "rewards/fixed_code_pass_all_test_reward/mean": 0.7652559280395508, "rewards/fixed_code_pass_all_test_reward/std": 0.4358838200569153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9014941892639734, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.035602202638983727, "learning_rate": 1.3400190759130767e-05, "loss": 0.0014, "num_tokens": 40641892.0, "reward": 1.607954502105713, "reward_std": 0.04821188002824783, "rewards/fixed_code_pass_all_test_reward/mean": 0.6079545021057129, "rewards/fixed_code_pass_all_test_reward/std": 0.04821184277534485, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9016786570743405, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.02627052366733551, "learning_rate": 1.3397162590745845e-05, "loss": 0.0011, "num_tokens": 40652155.0, "reward": 1.8804347515106201, "reward_std": 0.22139178216457367, "rewards/fixed_code_pass_all_test_reward/mean": 0.8804347515106201, "rewards/fixed_code_pass_all_test_reward/std": 0.22139176726341248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9018631248847077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.02698020508978516, "learning_rate": 1.3394134070165696e-05, "loss": 0.0011, "num_tokens": 40656650.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 395.0, "completions/mean_terminated_length": 395.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9020475926950747, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.06621546461246908, "learning_rate": 1.33911051977043e-05, "loss": 0.0026, "num_tokens": 40666626.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 381.125, "completions/mean_terminated_length": 381.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.9022320605054418, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.026237699203193188, "learning_rate": 1.3388075973675667e-05, "loss": 0.001, "num_tokens": 40674627.0, "reward": 1.8571429252624512, "reward_std": 0.3499270975589752, "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.9024165283158089, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.03774978092405945, "learning_rate": 1.3385046398393851e-05, "loss": 0.0015, "num_tokens": 40684477.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.902600996126176, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.02628690586425364, "learning_rate": 1.338201647217293e-05, "loss": 0.0011, "num_tokens": 40690876.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 364.75, "completions/mean_terminated_length": 364.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.902785463936543, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.039824796142056584, "learning_rate": 1.3378986195327034e-05, "loss": 0.0016, "num_tokens": 40701586.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9029699317469102, "frac_reward_zero_std": 1.0, "grad_norm": 0.06689453125, "kl": 0.02810564578976482, "learning_rate": 1.337595556817032e-05, "loss": 0.0011, "num_tokens": 40709849.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 680.75, "completions/mean_terminated_length": 680.75, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.9031543995572773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0390625, "kl": 0.024953900603577495, "learning_rate": 1.3372924591016988e-05, "loss": 0.001, "num_tokens": 40722167.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 690.25, "completions/mean_terminated_length": 690.25, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.9033388673676443, "frac_reward_zero_std": 0.0, "grad_norm": 0.52734375, "kl": 0.016722447704523802, "learning_rate": 1.3369893264181255e-05, "loss": 0.0007, "num_tokens": 40734737.0, "reward": 1.9301470518112183, "reward_std": 0.09683255106210709, "rewards/fixed_code_pass_all_test_reward/mean": 0.9301470518112183, "rewards/fixed_code_pass_all_test_reward/std": 0.09683256596326828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.9035233351780114, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.07078809663653374, "learning_rate": 1.3366861587977406e-05, "loss": 0.0028, "num_tokens": 40739493.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9037078029883785, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.07265775790438056, "learning_rate": 1.3363829562719737e-05, "loss": 0.0029, "num_tokens": 40743380.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9038922707987456, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.057736681774258614, "learning_rate": 1.3360797188722586e-05, "loss": 0.0023, "num_tokens": 40747544.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9040767386091128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.04446895164437592, "learning_rate": 1.335776446630033e-05, "loss": 0.0018, "num_tokens": 40758652.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9042612064194798, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.042834551306441426, "learning_rate": 1.335473139576739e-05, "loss": 0.0017, "num_tokens": 40768530.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9044456742298469, "frac_reward_zero_std": 1.0, "grad_norm": 0.043212890625, "kl": 0.02027237555012107, "learning_rate": 1.3351697977438204e-05, "loss": 0.0008, "num_tokens": 40773023.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.904630142040214, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.047518352046608925, "learning_rate": 1.3348664211627265e-05, "loss": 0.0019, "num_tokens": 40780470.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.904814609850581, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.054313099244609475, "learning_rate": 1.3345630098649084e-05, "loss": 0.0022, "num_tokens": 40788590.0, "reward": 1.4375, "reward_std": 0.40779241919517517, "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, "rewards/fixed_code_pass_all_test_reward/std": 0.40779241919517517, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.9049990776609481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.07645459473133087, "learning_rate": 1.334259563881823e-05, "loss": 0.0031, "num_tokens": 40796624.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 578.75, "completions/mean_terminated_length": 578.75, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.9051835454713153, "frac_reward_zero_std": 0.0, "grad_norm": 0.89453125, "kl": 0.03981074463808909, "learning_rate": 1.3339560832449284e-05, "loss": 0.0016, "num_tokens": 40809134.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9053680132816824, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.059877749998122454, "learning_rate": 1.3336525679856877e-05, "loss": 0.0024, "num_tokens": 40818872.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9055524810920494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.027943127555772662, "learning_rate": 1.333349018135568e-05, "loss": 0.0011, "num_tokens": 40825115.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 346.375, "completions/mean_terminated_length": 346.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9057369489024165, "frac_reward_zero_std": 1.0, "grad_norm": 0.06298828125, "kl": 0.03194777714088559, "learning_rate": 1.333045433726039e-05, "loss": 0.0013, "num_tokens": 40836198.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 231.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9059214167127836, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.05417779728304595, "learning_rate": 1.332741814788574e-05, "loss": 0.0022, "num_tokens": 40841337.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9061058845231507, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.15514911990612745, "learning_rate": 1.3324381613546505e-05, "loss": 0.0062, "num_tokens": 40844968.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 479.875, "completions/mean_terminated_length": 479.875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.9062903523335178, "frac_reward_zero_std": 1.0, "grad_norm": 0.048828125, "kl": 0.027204880490899086, "learning_rate": 1.3321344734557488e-05, "loss": 0.0011, "num_tokens": 40858007.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 362.375, "completions/mean_terminated_length": 362.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9064748201438849, "frac_reward_zero_std": 0.0, "grad_norm": 0.875, "kl": 0.044925668742507696, "learning_rate": 1.3318307511233542e-05, "loss": 0.0018, "num_tokens": 40865354.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 426.25, "completions/mean_terminated_length": 426.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.906659287954252, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.04387883469462395, "learning_rate": 1.3315269943889536e-05, "loss": 0.0018, "num_tokens": 40877548.0, "reward": 1.8406250476837158, "reward_std": 0.3204175531864166, "rewards/fixed_code_pass_all_test_reward/mean": 0.8406250476837158, "rewards/fixed_code_pass_all_test_reward/std": 0.320417582988739, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9068437557646191, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.04857798223383725, "learning_rate": 1.3312232032840391e-05, "loss": 0.0019, "num_tokens": 40882230.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.9070282235749861, "frac_reward_zero_std": 1.0, "grad_norm": 0.054931640625, "kl": 0.022467744420282543, "learning_rate": 1.3309193778401055e-05, "loss": 0.0009, "num_tokens": 40887916.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.9072126913853532, "frac_reward_zero_std": 1.0, "grad_norm": 0.3828125, "kl": 0.057570791454054415, "learning_rate": 1.3306155180886517e-05, "loss": 0.0023, "num_tokens": 40892032.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9073971591957204, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.034020210383459926, "learning_rate": 1.3303116240611793e-05, "loss": 0.0014, "num_tokens": 40897362.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9075816270060875, "frac_reward_zero_std": 1.0, "grad_norm": 0.55078125, "kl": 0.039389976183883846, "learning_rate": 1.3300076957891946e-05, "loss": 0.0016, "num_tokens": 40906701.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 484.875, "completions/mean_terminated_length": 484.875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.9077660948164545, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.032578504644334316, "learning_rate": 1.3297037333042065e-05, "loss": 0.0013, "num_tokens": 40915932.0, "reward": 1.0833332538604736, "reward_std": 0.0690065547823906, "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, "rewards/fixed_code_pass_all_test_reward/std": 0.06900656223297119, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9079505626268216, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.02605132490862161, "learning_rate": 1.3293997366377278e-05, "loss": 0.001, "num_tokens": 40921753.0, "reward": 1.53125, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9081350304371887, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.04977713106200099, "learning_rate": 1.329095705821275e-05, "loss": 0.002, "num_tokens": 40930940.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9083194982475558, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.04050698480568826, "learning_rate": 1.3287916408863679e-05, "loss": 0.0016, "num_tokens": 40940459.0, "reward": 1.8125, "reward_std": 0.035355329513549805, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.035355325788259506, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 633.625, "completions/mean_terminated_length": 633.625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.9085039660579229, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.03587809926830232, "learning_rate": 1.32848754186453e-05, "loss": 0.0014, "num_tokens": 40953760.0, "reward": 1.797619104385376, "reward_std": 0.3781786859035492, "rewards/fixed_code_pass_all_test_reward/mean": 0.9226190447807312, "rewards/fixed_code_pass_all_test_reward/std": 0.20031964778900146, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.90868843386829, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.07274649292230606, "learning_rate": 1.3281834087872882e-05, "loss": 0.0029, "num_tokens": 40957852.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.9088729016786571, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.019918264646548778, "learning_rate": 1.327879241686173e-05, "loss": 0.0008, "num_tokens": 40967792.0, "reward": 1.2411764860153198, "reward_std": 0.010892018675804138, "rewards/fixed_code_pass_all_test_reward/mean": 0.24117647111415863, "rewards/fixed_code_pass_all_test_reward/std": 0.010892000049352646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 434.0, "completions/mean_terminated_length": 434.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9090573694890242, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.03044092608615756, "learning_rate": 1.3275750405927186e-05, "loss": 0.0012, "num_tokens": 40978528.0, "reward": 1.8125, "reward_std": 0.3720118999481201, "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9092418372993912, "frac_reward_zero_std": 1.0, "grad_norm": 0.06396484375, "kl": 0.029675901168957353, "learning_rate": 1.3272708055384624e-05, "loss": 0.0012, "num_tokens": 40982740.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9094263051097583, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.09727480821311474, "learning_rate": 1.3269665365549454e-05, "loss": 0.0039, "num_tokens": 40986671.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.9096107729201255, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.025985549087636173, "learning_rate": 1.3266622336737123e-05, "loss": 0.001, "num_tokens": 40990496.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9097952407304926, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.030854206532239914, "learning_rate": 1.326357896926311e-05, "loss": 0.0012, "num_tokens": 40998160.0, "reward": 1.8392857313156128, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 420.375, "completions/mean_terminated_length": 420.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.9099797085408596, "frac_reward_zero_std": 1.0, "grad_norm": 0.052734375, "kl": 0.024414860527031124, "learning_rate": 1.3260535263442936e-05, "loss": 0.001, "num_tokens": 41009483.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 429.75, "completions/mean_terminated_length": 429.75, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.9101641763512267, "frac_reward_zero_std": 0.0, "grad_norm": 0.7734375, "kl": 0.026198599487543106, "learning_rate": 1.3257491219592145e-05, "loss": 0.001, "num_tokens": 41022369.0, "reward": 1.7268518209457397, "reward_std": 0.4350585341453552, "rewards/fixed_code_pass_all_test_reward/mean": 0.7268518209457397, "rewards/fixed_code_pass_all_test_reward/std": 0.4350585341453552, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.9103486441615938, "frac_reward_zero_std": 1.0, "grad_norm": 0.267578125, "kl": 0.05025681125698611, "learning_rate": 1.325444683802633e-05, "loss": 0.002, "num_tokens": 41032110.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9105331119719609, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.03958134073764086, "learning_rate": 1.3251402119061105e-05, "loss": 0.0016, "num_tokens": 41042686.0, "reward": 1.8164557218551636, "reward_std": 0.2552356421947479, "rewards/fixed_code_pass_all_test_reward/mean": 0.8164557218551636, "rewards/fixed_code_pass_all_test_reward/std": 0.2552356719970703, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 299.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.910717579782328, "frac_reward_zero_std": 1.0, "grad_norm": 0.05322265625, "kl": 0.025689915637485683, "learning_rate": 1.3248357063012135e-05, "loss": 0.001, "num_tokens": 41050920.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 436.625, "completions/mean_terminated_length": 436.625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.9109020475926951, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.03860212117433548, "learning_rate": 1.3245311670195108e-05, "loss": 0.0015, "num_tokens": 41061181.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9110865154030622, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.04649002104997635, "learning_rate": 1.3242265940925743e-05, "loss": 0.0019, "num_tokens": 41069355.0, "reward": 0.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 379.375, "completions/mean_terminated_length": 379.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9112709832134293, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.0359058219473809, "learning_rate": 1.323921987551981e-05, "loss": 0.0014, "num_tokens": 41077950.0, "reward": 1.5, "reward_std": 0.4432026147842407, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 421.25, "completions/mean_terminated_length": 421.25, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.9114554510237963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0556640625, "kl": 0.026673976564779878, "learning_rate": 1.3236173474293102e-05, "loss": 0.0011, "num_tokens": 41090944.0, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.9116399188341634, "frac_reward_zero_std": 0.0, "grad_norm": 0.890625, "kl": 0.028657105984166265, "learning_rate": 1.3233126737561448e-05, "loss": 0.0011, "num_tokens": 41097573.0, "reward": 1.9525861740112305, "reward_std": 0.13410648703575134, "rewards/fixed_code_pass_all_test_reward/mean": 0.9525861740112305, "rewards/fixed_code_pass_all_test_reward/std": 0.13410647213459015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9118243866445306, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.030189326615072787, "learning_rate": 1.3230079665640711e-05, "loss": 0.0012, "num_tokens": 41104088.0, "reward": 1.8229167461395264, "reward_std": 0.29693377017974854, "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166269302368, "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9120088544548977, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.030878696125000715, "learning_rate": 1.3227032258846799e-05, "loss": 0.0012, "num_tokens": 41110715.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9121933222652647, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.03172890481073409, "learning_rate": 1.3223984517495643e-05, "loss": 0.0013, "num_tokens": 41121074.0, "reward": 1.25, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.25, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 382.0, "completions/mean_terminated_length": 382.0, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.9123777900756318, "frac_reward_zero_std": 1.0, "grad_norm": 0.0291748046875, "kl": 0.014575378852896392, "learning_rate": 1.3220936441903212e-05, "loss": 0.0006, "num_tokens": 41129482.0, "reward": 1.1333333253860474, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9125622578859989, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.0360656474949792, "learning_rate": 1.3217888032385508e-05, "loss": 0.0014, "num_tokens": 41138448.0, "reward": 1.4439655542373657, "reward_std": 0.43076345324516296, "rewards/fixed_code_pass_all_test_reward/mean": 0.44396552443504333, "rewards/fixed_code_pass_all_test_reward/std": 0.43076348304748535, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.912746725696366, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.04588445834815502, "learning_rate": 1.3214839289258574e-05, "loss": 0.0018, "num_tokens": 41148582.0, "reward": 1.9945652484893799, "reward_std": 0.015371870249509811, "rewards/fixed_code_pass_all_test_reward/mean": 0.9945652484893799, "rewards/fixed_code_pass_all_test_reward/std": 0.015371883288025856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.9129311935067331, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.029633078491315246, "learning_rate": 1.321179021283848e-05, "loss": 0.0012, "num_tokens": 41152495.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9131156613171002, "frac_reward_zero_std": 1.0, "grad_norm": 0.05908203125, "kl": 0.027881750371307135, "learning_rate": 1.3208740803441336e-05, "loss": 0.0011, "num_tokens": 41159060.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.9133001291274673, "frac_reward_zero_std": 1.0, "grad_norm": 0.060546875, "kl": 0.03247127577196807, "learning_rate": 1.3205691061383283e-05, "loss": 0.0013, "num_tokens": 41165782.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 454.75, "completions/mean_terminated_length": 454.75, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9134845969378343, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.05534623563289642, "learning_rate": 1.3202640986980504e-05, "loss": 0.0022, "num_tokens": 41177812.0, "reward": 1.9456522464752197, "reward_std": 0.15371885895729065, "rewards/fixed_code_pass_all_test_reward/mean": 0.945652186870575, "rewards/fixed_code_pass_all_test_reward/std": 0.15371887385845184, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 471.375, "completions/mean_terminated_length": 471.375, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.9136690647482014, "frac_reward_zero_std": 1.0, "grad_norm": 0.034912109375, "kl": 0.02334114466793835, "learning_rate": 1.31995905805492e-05, "loss": 0.0009, "num_tokens": 41186967.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9138535325585685, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.05870185559615493, "learning_rate": 1.3196539842405625e-05, "loss": 0.0023, "num_tokens": 41193794.0, "reward": 1.9895833730697632, "reward_std": 0.029462741687893867, "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833730697632, "rewards/fixed_code_pass_all_test_reward/std": 0.029462775215506554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9140380003689356, "frac_reward_zero_std": 1.0, "grad_norm": 0.04345703125, "kl": 0.030987567268311977, "learning_rate": 1.3193488772866055e-05, "loss": 0.0012, "num_tokens": 41200448.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 389.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.9142224681793027, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.05428264872170985, "learning_rate": 1.3190437372246807e-05, "loss": 0.0022, "num_tokens": 41210088.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9144069359896698, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.02757007849868387, "learning_rate": 1.3187385640864227e-05, "loss": 0.0011, "num_tokens": 41216994.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9145914038000369, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.020145446644164622, "learning_rate": 1.3184333579034703e-05, "loss": 0.0008, "num_tokens": 41221281.0, "reward": 1.125, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.125, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.914775871610404, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.03514883737079799, "learning_rate": 1.3181281187074648e-05, "loss": 0.0014, "num_tokens": 41229110.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.914960339420771, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.021690602996386588, "learning_rate": 1.3178228465300516e-05, "loss": 0.0009, "num_tokens": 41234244.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9151448072311381, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.0181016429560259, "learning_rate": 1.3175175414028792e-05, "loss": 0.0007, "num_tokens": 41245774.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 426.5, "completions/mean_terminated_length": 426.5, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.9153292750415053, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.03917866060510278, "learning_rate": 1.3172122033575995e-05, "loss": 0.0016, "num_tokens": 41254050.0, "reward": 1.625, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.625, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9155137428518724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0673828125, "kl": 0.04402852681232616, "learning_rate": 1.3169068324258683e-05, "loss": 0.0018, "num_tokens": 41264079.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 548.625, "completions/mean_terminated_length": 548.625, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.9156982106622394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0218505859375, "kl": 0.01146214158507064, "learning_rate": 1.3166014286393443e-05, "loss": 0.0005, "num_tokens": 41274748.0, "reward": 1.2727272510528564, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 490.375, "completions/mean_terminated_length": 490.375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.9158826784726065, "frac_reward_zero_std": 0.0, "grad_norm": 0.69140625, "kl": 0.020233368617482483, "learning_rate": 1.3162959920296895e-05, "loss": 0.0008, "num_tokens": 41283831.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.9160671462829736, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.04282290278933942, "learning_rate": 1.3159905226285692e-05, "loss": 0.0017, "num_tokens": 41287916.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9162516140933407, "frac_reward_zero_std": 1.0, "grad_norm": 0.050537109375, "kl": 0.022994797094725072, "learning_rate": 1.3156850204676532e-05, "loss": 0.0009, "num_tokens": 41294101.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9164360819037078, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.03972340631298721, "learning_rate": 1.315379485578614e-05, "loss": 0.0016, "num_tokens": 41301542.0, "reward": 1.8801021575927734, "reward_std": 0.022979410365223885, "rewards/fixed_code_pass_all_test_reward/mean": 0.8801020383834839, "rewards/fixed_code_pass_all_test_reward/std": 0.022979410365223885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 362.375, "completions/mean_terminated_length": 362.375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.9166205497140749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.03181058750487864, "learning_rate": 1.3150739179931265e-05, "loss": 0.0013, "num_tokens": 41309473.0, "reward": 1.692307710647583, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.916805017524442, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.05006179539486766, "learning_rate": 1.3147683177428708e-05, "loss": 0.002, "num_tokens": 41320686.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9169894853348091, "frac_reward_zero_std": 1.0, "grad_norm": 0.072265625, "kl": 0.03785826195962727, "learning_rate": 1.3144626848595288e-05, "loss": 0.0015, "num_tokens": 41328905.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 342.5, "completions/mean_terminated_length": 342.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9171739531451761, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.022780447150580585, "learning_rate": 1.3141570193747873e-05, "loss": 0.0009, "num_tokens": 41340749.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9173584209555432, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.029114579083397985, "learning_rate": 1.313851321320335e-05, "loss": 0.0012, "num_tokens": 41344668.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9175428887659104, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.049869046779349446, "learning_rate": 1.3135455907278647e-05, "loss": 0.002, "num_tokens": 41352789.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9177273565762775, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.0696502416394651, "learning_rate": 1.3132398276290727e-05, "loss": 0.0028, "num_tokens": 41358947.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 711.625, "completions/mean_terminated_length": 711.625, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "epoch": 0.9179118243866445, "frac_reward_zero_std": 0.0, "grad_norm": 0.73828125, "kl": 0.025526192388497293, "learning_rate": 1.3129340320556587e-05, "loss": 0.001, "num_tokens": 41376120.0, "reward": 1.625, "reward_std": 0.6943650841712952, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9180962921970116, "frac_reward_zero_std": 1.0, "grad_norm": 0.353515625, "kl": 0.053649333538487554, "learning_rate": 1.3126282040393252e-05, "loss": 0.0021, "num_tokens": 41381671.0, "reward": 1.5, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.5, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9182807600073787, "frac_reward_zero_std": 1.0, "grad_norm": 0.044921875, "kl": 0.02353702951222658, "learning_rate": 1.3123223436117782e-05, "loss": 0.0009, "num_tokens": 41388783.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9184652278177458, "frac_reward_zero_std": 1.0, "grad_norm": 0.06005859375, "kl": 0.02939384605269879, "learning_rate": 1.3120164508047282e-05, "loss": 0.0012, "num_tokens": 41394008.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 374.5, "completions/mean_terminated_length": 374.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.9186496956281129, "frac_reward_zero_std": 0.0, "grad_norm": 0.95703125, "kl": 0.03050274634733796, "learning_rate": 1.311710525649887e-05, "loss": 0.0012, "num_tokens": 41403284.0, "reward": 1.90625, "reward_std": 0.0578637570142746, "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 284.75, "completions/mean_terminated_length": 284.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.91883416343848, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.04935234133154154, "learning_rate": 1.3114045681789716e-05, "loss": 0.002, "num_tokens": 41412170.0, "reward": 1.375, "reward_std": 0.5175491571426392, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9190186312488471, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.05447501584421843, "learning_rate": 1.3110985784237014e-05, "loss": 0.0022, "num_tokens": 41425221.0, "reward": 1.4791667461395264, "reward_std": 0.11086282134056091, "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, "rewards/fixed_code_pass_all_test_reward/std": 0.11086282134056091, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.9192030990592142, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.047786006703972816, "learning_rate": 1.3107925564157997e-05, "loss": 0.0019, "num_tokens": 41433234.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9193875668695812, "frac_reward_zero_std": 1.0, "grad_norm": 0.107421875, "kl": 0.04382905806414783, "learning_rate": 1.3104865021869923e-05, "loss": 0.0018, "num_tokens": 41442713.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9195720346799483, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.03923512948676944, "learning_rate": 1.310180415769009e-05, "loss": 0.0016, "num_tokens": 41451725.0, "reward": 1.75, "reward_std": 0.4629100561141968, "rewards/fixed_code_pass_all_test_reward/mean": 0.75, "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9197565024903155, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.16749763232655823, "learning_rate": 1.3098742971935831e-05, "loss": 0.0067, "num_tokens": 41462044.0, "reward": 1.8888888359069824, "reward_std": 0.31426966190338135, "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 552.25, "completions/mean_terminated_length": 552.25, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.9199409703006826, "frac_reward_zero_std": 0.0, "grad_norm": 0.59765625, "kl": 0.017881876439787447, "learning_rate": 1.309568146492451e-05, "loss": 0.0007, "num_tokens": 41476670.0, "reward": 1.1944444179534912, "reward_std": 0.07856737822294235, "rewards/fixed_code_pass_all_test_reward/mean": 0.1944444477558136, "rewards/fixed_code_pass_all_test_reward/std": 0.07856741547584534, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9201254381110496, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.058120565954595804, "learning_rate": 1.3092619636973517e-05, "loss": 0.0023, "num_tokens": 41484880.0, "reward": 1.875, "reward_std": 0.3535533845424652, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9203099059214167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.0341759700095281, "learning_rate": 1.3089557488400288e-05, "loss": 0.0014, "num_tokens": 41488872.0, "reward": 1.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 0.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9204943737317838, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.1078754581976682, "learning_rate": 1.3086495019522285e-05, "loss": 0.0043, "num_tokens": 41497321.0, "reward": 1.125, "reward_std": 0.8345229625701904, "rewards/fixed_code_pass_all_test_reward/mean": 0.375, "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4629100561141968, "step": 4990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9206788415421508, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.023799549555405974, "learning_rate": 1.3083432230657006e-05, "loss": 0.001, "num_tokens": 41504170.0, "reward": 1.8379629850387573, "reward_std": 0.06547286361455917, "rewards/fixed_code_pass_all_test_reward/mean": 0.8379629850387573, "rewards/fixed_code_pass_all_test_reward/std": 0.06547285616397858, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.920863309352518, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.059338051825761795, "learning_rate": 1.3080369122121974e-05, "loss": 0.0024, "num_tokens": 41510212.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9210477771628851, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.054949962766841054, "learning_rate": 1.307730569423476e-05, "loss": 0.0022, "num_tokens": 41518826.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 1099.625, "completions/mean_terminated_length": 1099.625, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 0.9212322449732522, "frac_reward_zero_std": 0.0, "grad_norm": 0.27734375, "kl": 0.008876230422174558, "learning_rate": 1.3074241947312954e-05, "loss": 0.0004, "num_tokens": 41545151.0, "reward": 1.89673912525177, "reward_std": 0.2920658588409424, "rewards/fixed_code_pass_all_test_reward/mean": 0.89673912525177, "rewards/fixed_code_pass_all_test_reward/std": 0.29206582903862, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9214167127836193, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.04390677623450756, "learning_rate": 1.3071177881674189e-05, "loss": 0.0018, "num_tokens": 41552346.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.9216011805939863, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.06578104849904776, "learning_rate": 1.306811349763612e-05, "loss": 0.0026, "num_tokens": 41561491.0, "reward": 1.8849999904632568, "reward_std": 0.32526910305023193, "rewards/fixed_code_pass_all_test_reward/mean": 0.8849999904632568, "rewards/fixed_code_pass_all_test_reward/std": 0.3252691328525543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 691.125, "completions/mean_terminated_length": 691.125, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.9217856484043534, "frac_reward_zero_std": 0.0, "grad_norm": 0.80859375, "kl": 0.019629077869467437, "learning_rate": 1.306504879551645e-05, "loss": 0.0008, "num_tokens": 41578516.0, "reward": 1.59375, "reward_std": 0.2651650309562683, "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 385.5, "completions/mean_terminated_length": 385.5, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.9219701162147206, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.026673778309486806, "learning_rate": 1.3061983775632904e-05, "loss": 0.0011, "num_tokens": 41586336.0, "reward": 1.8953487873077393, "reward_std": 0.03288870304822922, "rewards/fixed_code_pass_all_test_reward/mean": 0.895348846912384, "rewards/fixed_code_pass_all_test_reward/std": 0.03288870304822922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 4998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 386.25, "completions/mean_terminated_length": 386.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9221545840250877, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.05683528119698167, "learning_rate": 1.3058918438303245e-05, "loss": 0.0023, "num_tokens": 41600298.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/fixed_code_pass_all_test_reward/mean": 0.875, "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3535533845424652, "step": 4999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9223390518354547, "frac_reward_zero_std": 1.0, "grad_norm": 0.96875, "kl": 0.1047405373537913, "learning_rate": 1.305585278384526e-05, "loss": 0.0042, "num_tokens": 41608136.0, "reward": 2.0, "reward_std": 0.0, "rewards/fixed_code_pass_all_test_reward/mean": 1.0, "rewards/fixed_code_pass_all_test_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 5000 } ], "logging_steps": 1, "max_steps": 10842, "num_input_tokens_seen": 41608136, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }