diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9223390518354547, + "eval_steps": 5000, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1263.0, + "completions/max_terminated_length": 1263.0, + "completions/mean_length": 646.5, + "completions/mean_terminated_length": 646.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.00018446781036709093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 9556.0, + "reward": 1.3977272510528564, + "reward_std": 0.7336369156837463, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.42362356185913086, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.00036893562073418186, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 1.8433179723502305e-08, + "loss": -0.0, + "num_tokens": 17493.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 336.375, + "completions/mean_terminated_length": 336.375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.0005534034311012728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.00016184583455469692, + "learning_rate": 3.686635944700461e-08, + "loss": 0.0, + "num_tokens": 26416.0, + "reward": 1.462499976158142, + "reward_std": 0.329712450504303, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.318696528673172, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 284.625, + "completions/mean_terminated_length": 284.625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.0007378712414683637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.0001465609470869822, + "learning_rate": 5.529953917050692e-08, + "loss": 0.0, + "num_tokens": 35517.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 396.25, + "completions/mean_terminated_length": 396.25, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.0009223390518354548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.00013932313049735967, + "learning_rate": 7.373271889400922e-08, + "loss": 0.0, + "num_tokens": 42935.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 602.75, + "completions/mean_terminated_length": 602.75, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.0011068068622025456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.00022564510436495766, + "learning_rate": 9.216589861751152e-08, + "loss": 0.0, + "num_tokens": 54309.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 499.125, + "completions/mean_terminated_length": 499.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.0012912746725696365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.0002626226232678164, + "learning_rate": 1.1059907834101384e-07, + "loss": 0.0, + "num_tokens": 63726.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 296.0, + "completions/mean_terminated_length": 296.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0014757424829367274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.00026422405971970875, + "learning_rate": 1.2903225806451614e-07, + "loss": 0.0, + "num_tokens": 69214.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 418.625, + "completions/mean_terminated_length": 418.625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.0016602102933038186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.00015682746834499994, + "learning_rate": 1.4746543778801844e-07, + "loss": 0.0, + "num_tokens": 79675.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 434.625, + "completions/mean_terminated_length": 434.625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.0018446781036709095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.00019636107299447758, + "learning_rate": 1.6589861751152077e-07, + "loss": 0.0, + "num_tokens": 87800.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 161.5, + "completions/mean_terminated_length": 161.5, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.0020291459140380002, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.00018017001002590405, + "learning_rate": 1.8433179723502305e-07, + "loss": 0.0, + "num_tokens": 91860.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 539.25, + "completions/mean_terminated_length": 539.25, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.002213613724405091, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.00024090245096886065, + "learning_rate": 2.0276497695852537e-07, + "loss": 0.0, + "num_tokens": 105910.0, + "reward": 1.0625, + "reward_std": 0.6813851594924927, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.32732683420181274, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 331.625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.002398081534772182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.00023027175666356925, + "learning_rate": 2.2119815668202768e-07, + "loss": 0.0, + "num_tokens": 112539.0, + "reward": 0.776442289352417, + "reward_std": 0.4996762275695801, + "rewards/fixed_code_pass_all_test_reward/mean": 0.401442289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.2804637849330902, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 332.75, + "completions/mean_terminated_length": 332.75, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.002582549345139273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.0002332902204216225, + "learning_rate": 2.3963133640553e-07, + "loss": 0.0, + "num_tokens": 120241.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 399.25, + "completions/mean_terminated_length": 399.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.002767017155506364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.00019597944447014015, + "learning_rate": 2.580645161290323e-07, + "loss": 0.0, + "num_tokens": 129275.0, + "reward": 1.5431817770004272, + "reward_std": 0.4559911787509918, + "rewards/fixed_code_pass_all_test_reward/mean": 0.668181836605072, + "rewards/fixed_code_pass_all_test_reward/std": 0.4215841293334961, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 278.75, + "completions/mean_terminated_length": 278.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.002951484965873455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.00018851919548978913, + "learning_rate": 2.764976958525346e-07, + "loss": 0.0, + "num_tokens": 134441.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 604.375, + "completions/mean_terminated_length": 604.375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.003135952776240546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.00017959067736228462, + "learning_rate": 2.949308755760369e-07, + "loss": 0.0, + "num_tokens": 144380.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 376.125, + "completions/mean_terminated_length": 376.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.003320420586607637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0002442944605718367, + "learning_rate": 3.133640552995392e-07, + "loss": 0.0, + "num_tokens": 151581.0, + "reward": 0.7074999809265137, + "reward_std": 0.6952028870582581, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20749999582767487, + "rewards/fixed_code_pass_all_test_reward/std": 0.3301839530467987, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.003504888396974728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.00024068942002486438, + "learning_rate": 3.3179723502304154e-07, + "loss": 0.0, + "num_tokens": 156697.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 677.375, + "completions/mean_terminated_length": 677.375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.003689356207341819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.00015377472118416335, + "learning_rate": 3.5023041474654376e-07, + "loss": 0.0, + "num_tokens": 172716.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 290.0, + "completions/mean_terminated_length": 290.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.00387382401770891, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.00029984075081301853, + "learning_rate": 3.686635944700461e-07, + "loss": 0.0, + "num_tokens": 177988.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1787.0, + "completions/max_terminated_length": 1787.0, + "completions/mean_length": 619.0, + "completions/mean_terminated_length": 619.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.0040582918280760005, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.671875, + "kl": 0.00015745200380479218, + "learning_rate": 3.870967741935484e-07, + "loss": 0.0, + "num_tokens": 192316.0, + "reward": 0.8888888359069824, + "reward_std": 0.3596327006816864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, + "rewards/fixed_code_pass_all_test_reward/std": 0.019168488681316376, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 539.375, + "completions/mean_terminated_length": 539.375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.004242759638443091, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.0001711196609903709, + "learning_rate": 4.0552995391705075e-07, + "loss": 0.0, + "num_tokens": 201975.0, + "reward": 0.5089285969734192, + "reward_std": 0.5364790558815002, + "rewards/fixed_code_pass_all_test_reward/mean": 0.008928571827709675, + "rewards/fixed_code_pass_all_test_reward/std": 0.007393559440970421, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 565.625, + "completions/mean_terminated_length": 565.625, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.004427227448810182, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.0002451626151014352, + "learning_rate": 4.23963133640553e-07, + "loss": 0.0, + "num_tokens": 217588.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 597.875, + "completions/mean_terminated_length": 597.875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.004611695259177273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.00022689109391649254, + "learning_rate": 4.4239631336405535e-07, + "loss": 0.0, + "num_tokens": 232307.0, + "reward": 1.3095238208770752, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4345238208770752, + "rewards/fixed_code_pass_all_test_reward/std": 0.1912188082933426, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 280.25, + "completions/mean_terminated_length": 280.25, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.004796163069544364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.00022167046336107887, + "learning_rate": 4.608294930875577e-07, + "loss": 0.0, + "num_tokens": 240109.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 226.5, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.004980630879911455, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.0002217927240053541, + "learning_rate": 4.7926267281106e-07, + "loss": 0.0, + "num_tokens": 244777.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 557.0, + "completions/mean_terminated_length": 557.0, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.005165098690278546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.0001640177094941464, + "learning_rate": 4.976958525345623e-07, + "loss": 0.0, + "num_tokens": 255409.0, + "reward": 1.5416667461395264, + "reward_std": 0.8275063037872314, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.374898761510849, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 486.25, + "completions/mean_terminated_length": 486.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.005349566500645637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.00023256600616150536, + "learning_rate": 5.161290322580646e-07, + "loss": 0.0, + "num_tokens": 266307.0, + "reward": 1.262195110321045, + "reward_std": 0.5335278511047363, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3871951103210449, + "rewards/fixed_code_pass_all_test_reward/std": 0.3236880600452423, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 458.5, + "completions/mean_terminated_length": 458.5, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.005534034311012728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.00022078821348259225, + "learning_rate": 5.345622119815668e-07, + "loss": 0.0, + "num_tokens": 274935.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1012.5, + "completions/mean_terminated_length": 667.3333740234375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.005718502121379819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.00015209946104732808, + "learning_rate": 5.529953917050692e-07, + "loss": 0.0, + "num_tokens": 289515.0, + "reward": 1.02734375, + "reward_std": 0.6355004906654358, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40234375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3611418306827545, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 415.875, + "completions/mean_terminated_length": 415.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.00590296993174691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.00021360143909987528, + "learning_rate": 5.714285714285715e-07, + "loss": 0.0, + "num_tokens": 296794.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 450.75, + "completions/mean_terminated_length": 450.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.006087437742114001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.00021438846397359157, + "learning_rate": 5.898617511520738e-07, + "loss": 0.0, + "num_tokens": 307232.0, + "reward": 0.8928571343421936, + "reward_std": 0.8390957117080688, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 385.75, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.006271905552481092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0034027099609375, + "kl": 0.00017366967131238198, + "learning_rate": 6.082949308755762e-07, + "loss": 0.0, + "num_tokens": 313486.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 599.875, + "completions/mean_terminated_length": 599.875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.006456373362848183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.00015820199132576818, + "learning_rate": 6.267281105990784e-07, + "loss": 0.0, + "num_tokens": 322501.0, + "reward": 1.6428571939468384, + "reward_std": 0.38936299085617065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.10628911107778549, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 427.125, + "completions/mean_terminated_length": 427.125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.006640841173215274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0001587776869200752, + "learning_rate": 6.451612903225807e-07, + "loss": 0.0, + "num_tokens": 334158.0, + "reward": 1.4083333015441895, + "reward_std": 0.7391006946563721, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6583333611488342, + "rewards/fixed_code_pass_all_test_reward/std": 0.3215784728527069, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 553.875, + "completions/mean_terminated_length": 553.875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.006825308983582365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.00016334438805643003, + "learning_rate": 6.635944700460831e-07, + "loss": 0.0, + "num_tokens": 345949.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 600.75, + "completions/mean_terminated_length": 600.75, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.007009776793949456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.00028653825029323343, + "learning_rate": 6.820276497695854e-07, + "loss": 0.0, + "num_tokens": 355195.0, + "reward": 0.8265306353569031, + "reward_std": 0.5420389175415039, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3265306353569031, + "rewards/fixed_code_pass_all_test_reward/std": 0.20755070447921753, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1452.0, + "completions/max_terminated_length": 1452.0, + "completions/mean_length": 793.625, + "completions/mean_terminated_length": 793.625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.007194244604316547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0002119701912306482, + "learning_rate": 7.004608294930875e-07, + "loss": 0.0, + "num_tokens": 366856.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 501.75, + "completions/mean_terminated_length": 501.75, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.007378712414683638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.0001861243126768386, + "learning_rate": 7.1889400921659e-07, + "loss": 0.0, + "num_tokens": 375366.0, + "reward": 0.699999988079071, + "reward_std": 0.6141195893287659, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32499998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.3195979595184326, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 438.125, + "completions/mean_terminated_length": 438.125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.007563180225050729, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.0002582957795311813, + "learning_rate": 7.373271889400922e-07, + "loss": 0.0, + "num_tokens": 383559.0, + "reward": 0.83984375, + "reward_std": 0.4687314033508301, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.08749601244926453, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 443.75, + "completions/mean_terminated_length": 443.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.00774764803541782, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.0002460242722008843, + "learning_rate": 7.557603686635945e-07, + "loss": 0.0, + "num_tokens": 392973.0, + "reward": 1.1982758045196533, + "reward_std": 0.8355019688606262, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4482758641242981, + "rewards/fixed_code_pass_all_test_reward/std": 0.4770955741405487, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 293.5, + "completions/mean_terminated_length": 293.5, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.00793211584578491, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.00019570136464608368, + "learning_rate": 7.741935483870968e-07, + "loss": 0.0, + "num_tokens": 401985.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.008116583656152001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.0001460007972582389, + "learning_rate": 7.926267281105991e-07, + "loss": 0.0, + "num_tokens": 407596.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 424.375, + "completions/mean_terminated_length": 424.375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.008301051466519093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.00017994168865698157, + "learning_rate": 8.110599078341015e-07, + "loss": 0.0, + "num_tokens": 415879.0, + "reward": 0.8928571343421936, + "reward_std": 0.6468132138252258, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, + "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 369.125, + "completions/mean_terminated_length": 369.125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.008485519276886183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0001759659353410825, + "learning_rate": 8.294930875576038e-07, + "loss": 0.0, + "num_tokens": 422768.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 696.375, + "completions/mean_terminated_length": 696.375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.008669987087253275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.00017174023378174752, + "learning_rate": 8.47926267281106e-07, + "loss": 0.0, + "num_tokens": 434227.0, + "reward": 1.4757652282714844, + "reward_std": 0.46155646443367004, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7257653474807739, + "rewards/fixed_code_pass_all_test_reward/std": 0.27170121669769287, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 342.125, + "completions/mean_terminated_length": 342.125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.008854454897620365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.00018270496093464317, + "learning_rate": 8.663594470046084e-07, + "loss": 0.0, + "num_tokens": 445524.0, + "reward": 0.9421296715736389, + "reward_std": 0.3907661437988281, + "rewards/fixed_code_pass_all_test_reward/mean": 0.06712962687015533, + "rewards/fixed_code_pass_all_test_reward/std": 0.09229449927806854, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.009038922707987456, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.5, + "kl": 0.00014784874838369433, + "learning_rate": 8.847926267281107e-07, + "loss": 0.0, + "num_tokens": 450820.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 656.0, + "completions/mean_terminated_length": 656.0, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.009223390518354546, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.0001441684862584225, + "learning_rate": 9.032258064516129e-07, + "loss": 0.0, + "num_tokens": 461076.0, + "reward": 1.0, + "reward_std": 0.6172134280204773, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 471.875, + "completions/mean_terminated_length": 471.875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.009407858328721638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.00013033490677116788, + "learning_rate": 9.216589861751154e-07, + "loss": 0.0, + "num_tokens": 470219.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 551.5, + "completions/mean_terminated_length": 551.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.009592326139088728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.00011888523840752896, + "learning_rate": 9.400921658986175e-07, + "loss": 0.0, + "num_tokens": 481007.0, + "reward": 1.0029070377349854, + "reward_std": 0.53458571434021, + "rewards/fixed_code_pass_all_test_reward/mean": 0.12790697813034058, + "rewards/fixed_code_pass_all_test_reward/std": 0.3524727523326874, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 273.625, + "completions/mean_terminated_length": 273.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.00977679394945582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.0001700549510132987, + "learning_rate": 9.5852534562212e-07, + "loss": 0.0, + "num_tokens": 486060.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 519.625, + "completions/mean_terminated_length": 519.625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.00996126175982291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.00022266689484240487, + "learning_rate": 9.769585253456222e-07, + "loss": 0.0, + "num_tokens": 496705.0, + "reward": 0.9015151262283325, + "reward_std": 0.5400769114494324, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1515151560306549, + "rewards/fixed_code_pass_all_test_reward/std": 0.24511492252349854, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 676.0, + "completions/mean_terminated_length": 676.0, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.010145729570190002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.00014531454917232622, + "learning_rate": 9.953917050691246e-07, + "loss": 0.0, + "num_tokens": 511441.0, + "reward": 0.9711538553237915, + "reward_std": 0.5999965071678162, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2211538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.1390555202960968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 654.5, + "completions/mean_terminated_length": 455.4285888671875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.010330197380557092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.00017269806494368822, + "learning_rate": 1.013824884792627e-06, + "loss": 0.0, + "num_tokens": 523501.0, + "reward": 1.139925479888916, + "reward_std": 0.6464568972587585, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5149253606796265, + "rewards/fixed_code_pass_all_test_reward/std": 0.31781885027885437, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 442.0, + "completions/mean_terminated_length": 442.0, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.010514665190924184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.00022005929895385634, + "learning_rate": 1.0322580645161291e-06, + "loss": 0.0, + "num_tokens": 534421.0, + "reward": 1.4196429252624512, + "reward_std": 0.5780074000358582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6696428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3534245193004608, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.010699133001291274, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.00013272725936985807, + "learning_rate": 1.0506912442396313e-06, + "loss": 0.0, + "num_tokens": 538620.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 246.875, + "completions/mean_terminated_length": 246.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.010883600811658366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.00011469083437987138, + "learning_rate": 1.0691244239631337e-06, + "loss": 0.0, + "num_tokens": 543435.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 698.75, + "completions/mean_terminated_length": 249.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.011068068622025456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.00024947148813225795, + "learning_rate": 1.087557603686636e-06, + "loss": 0.0, + "num_tokens": 552041.0, + "reward": 0.875, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1383.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 552.875, + "completions/mean_terminated_length": 552.875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.011252536432392548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.0002478203441569349, + "learning_rate": 1.1059907834101384e-06, + "loss": 0.0, + "num_tokens": 561000.0, + "reward": 0.8541666865348816, + "reward_std": 0.6135863065719604, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.4833538830280304, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.011437004242759638, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.00011564837814148632, + "learning_rate": 1.1244239631336406e-06, + "loss": 0.0, + "num_tokens": 565241.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 347.5, + "completions/mean_terminated_length": 347.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.01162147205312673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.00023581041023135185, + "learning_rate": 1.142857142857143e-06, + "loss": 0.0, + "num_tokens": 571197.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 323.0, + "completions/mean_terminated_length": 323.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.01180593986349382, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6875, + "kl": 0.000263547352005844, + "learning_rate": 1.1612903225806454e-06, + "loss": 0.0, + "num_tokens": 581405.0, + "reward": 0.8928571343421936, + "reward_std": 0.3642157018184662, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 238.0, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.011990407673860911, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.0001780624866114522, + "learning_rate": 1.1797235023041475e-06, + "loss": 0.0, + "num_tokens": 588253.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 213.75, + "completions/mean_terminated_length": 213.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.012174875484228001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.00018533837464929093, + "learning_rate": 1.19815668202765e-06, + "loss": 0.0, + "num_tokens": 592787.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 692.375, + "completions/mean_terminated_length": 692.375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.012359343294595093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.00023742977191432146, + "learning_rate": 1.2165898617511523e-06, + "loss": 0.0, + "num_tokens": 607598.0, + "reward": 0.800000011920929, + "reward_std": 0.7406560778617859, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17499999701976776, + "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 633.375, + "completions/mean_terminated_length": 633.375, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.012543811104962183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.00013511399265553337, + "learning_rate": 1.2350230414746545e-06, + "loss": 0.0, + "num_tokens": 617577.0, + "reward": 0.9166667461395264, + "reward_std": 0.8309490084648132, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.4154745042324066, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 399.0, + "completions/mean_terminated_length": 399.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.012728278915329275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004486083984375, + "kl": 0.00023202788543130737, + "learning_rate": 1.2534562211981569e-06, + "loss": 0.0, + "num_tokens": 629769.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1956.0, + "completions/max_terminated_length": 1956.0, + "completions/mean_length": 616.5, + "completions/mean_terminated_length": 616.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.012912746725696367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.00015789896042406326, + "learning_rate": 1.271889400921659e-06, + "loss": 0.0, + "num_tokens": 641781.0, + "reward": 0.7708333730697632, + "reward_std": 0.46876654028892517, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, + "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 423.625, + "completions/mean_terminated_length": 191.57144165039062, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.013097214536063457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.00016825743887238787, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.0, + "num_tokens": 648842.0, + "reward": 1.25, + "reward_std": 1.0350983142852783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 356.125, + "completions/mean_terminated_length": 356.125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.013281682346430549, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.00022476712092611706, + "learning_rate": 1.3087557603686638e-06, + "loss": 0.0, + "num_tokens": 655355.0, + "reward": 0.7000000476837158, + "reward_std": 0.4820590913295746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.29113897681236267, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 652.0, + "completions/mean_terminated_length": 452.5714416503906, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.013466150156797639, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.75, + "kl": 0.0002261804784211563, + "learning_rate": 1.3271889400921662e-06, + "loss": 0.0, + "num_tokens": 666403.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.01365061796716473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.00017740407565725036, + "learning_rate": 1.3456221198156683e-06, + "loss": 0.0, + "num_tokens": 670388.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.01383508577753182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.00030659233379992656, + "learning_rate": 1.3640552995391707e-06, + "loss": 0.0, + "num_tokens": 675493.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 555.0, + "completions/mean_terminated_length": 555.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.014019553587898912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.00023349323964794166, + "learning_rate": 1.382488479262673e-06, + "loss": 0.0, + "num_tokens": 688861.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1309.0, + "completions/max_terminated_length": 1309.0, + "completions/mean_length": 938.125, + "completions/mean_terminated_length": 938.125, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.014204021398266002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.0003092967308475636, + "learning_rate": 1.400921658986175e-06, + "loss": 0.0, + "num_tokens": 703574.0, + "reward": 0.3822115361690521, + "reward_std": 0.5278024077415466, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0072115384973585606, + "rewards/fixed_code_pass_all_test_reward/std": 0.020397311076521873, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 387.625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.014388489208633094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.00021497006855497602, + "learning_rate": 1.4193548387096776e-06, + "loss": 0.0, + "num_tokens": 715083.0, + "reward": 1.25, + "reward_std": 0.6306403279304504, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.40689605474472046, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 612.375, + "completions/mean_terminated_length": 407.2857360839844, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.014572957019000184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.0001775709733919939, + "learning_rate": 1.43778801843318e-06, + "loss": 0.0, + "num_tokens": 726710.0, + "reward": 0.9615384340286255, + "reward_std": 0.649468183517456, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21153846383094788, + "rewards/fixed_code_pass_all_test_reward/std": 0.3295787572860718, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 399.125, + "completions/mean_terminated_length": 399.125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.014757424829367276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.00019801928829110693, + "learning_rate": 1.456221198156682e-06, + "loss": 0.0, + "num_tokens": 734071.0, + "reward": 0.38749998807907104, + "reward_std": 0.5185625553131104, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01249999925494194, + "rewards/fixed_code_pass_all_test_reward/std": 0.018322506919503212, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 540.625, + "completions/mean_terminated_length": 540.625, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.014941892639734366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.000200824973944691, + "learning_rate": 1.4746543778801844e-06, + "loss": 0.0, + "num_tokens": 743548.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 781.0, + "completions/mean_terminated_length": 781.0, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.015126360450101458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.00017313397802354302, + "learning_rate": 1.4930875576036868e-06, + "loss": 0.0, + "num_tokens": 755860.0, + "reward": 0.8035714626312256, + "reward_std": 0.5286955237388611, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1785714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 837.875, + "completions/mean_terminated_length": 665.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.015310828260468548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.0002544065773690818, + "learning_rate": 1.511520737327189e-06, + "loss": 0.0, + "num_tokens": 770435.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 385.0, + "completions/mean_terminated_length": 385.0, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.01549529607083564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.000220081898078206, + "learning_rate": 1.5299539170506913e-06, + "loss": 0.0, + "num_tokens": 780955.0, + "reward": 1.170454502105713, + "reward_std": 0.5152629017829895, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.2368127703666687, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 445.625, + "completions/mean_terminated_length": 445.625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.01567976388120273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.00019480635637592059, + "learning_rate": 1.5483870967741937e-06, + "loss": 0.0, + "num_tokens": 793088.0, + "reward": 1.1689815521240234, + "reward_std": 0.17608249187469482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16898147761821747, + "rewards/fixed_code_pass_all_test_reward/std": 0.17608249187469482, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 296.625, + "completions/mean_terminated_length": 296.625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.01586423169156982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.00016720175563023076, + "learning_rate": 1.5668202764976959e-06, + "loss": 0.0, + "num_tokens": 798829.0, + "reward": 1.40625, + "reward_std": 0.9057110548019409, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 168.75, + "completions/mean_terminated_length": 168.75, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.016048699501936912, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.0005037817518314114, + "learning_rate": 1.5852534562211982e-06, + "loss": 0.0, + "num_tokens": 802827.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 340.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.016233167312304002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005035400390625, + "kl": 0.00017259119431400904, + "learning_rate": 1.6036866359447006e-06, + "loss": 0.0, + "num_tokens": 809255.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 489.625, + "completions/mean_terminated_length": 489.625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.016417635122671095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.00023493254229833838, + "learning_rate": 1.622119815668203e-06, + "loss": 0.0, + "num_tokens": 818868.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 640.5, + "completions/mean_terminated_length": 640.5, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.016602102933038185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.0002694915783649776, + "learning_rate": 1.6405529953917052e-06, + "loss": 0.0, + "num_tokens": 832872.0, + "reward": 0.8684210181236267, + "reward_std": 0.5631824731826782, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, + "rewards/fixed_code_pass_all_test_reward/std": 0.18766893446445465, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 422.0, + "completions/mean_terminated_length": 422.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.016786570743405275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.00021502541858353652, + "learning_rate": 1.6589861751152075e-06, + "loss": 0.0, + "num_tokens": 843376.0, + "reward": 1.5104167461395264, + "reward_std": 0.4552112817764282, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.4317220449447632, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1243.0, + "completions/max_terminated_length": 1243.0, + "completions/mean_length": 618.75, + "completions/mean_terminated_length": 618.75, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.016971038553772366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.86328125, + "kl": 0.00017798052704165457, + "learning_rate": 1.67741935483871e-06, + "loss": 0.0, + "num_tokens": 857310.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.01715550636413946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.00022852111760585103, + "learning_rate": 1.695852534562212e-06, + "loss": 0.0, + "num_tokens": 862846.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 493.0, + "completions/mean_terminated_length": 493.0, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.01733997417450655, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0002933645519078709, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.0, + "num_tokens": 871038.0, + "reward": 0.7307692170143127, + "reward_std": 0.6435846090316772, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23076924681663513, + "rewards/fixed_code_pass_all_test_reward/std": 0.358450710773468, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 456.625, + "completions/mean_terminated_length": 456.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.01752444198487364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0002652740322446334, + "learning_rate": 1.7327188940092169e-06, + "loss": 0.0, + "num_tokens": 879763.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 282.125, + "completions/mean_terminated_length": 282.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.01770890979524073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.00019635578155430267, + "learning_rate": 1.751152073732719e-06, + "loss": 0.0, + "num_tokens": 889212.0, + "reward": 1.0999999046325684, + "reward_std": 0.45669618248939514, + "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, + "rewards/fixed_code_pass_all_test_reward/std": 0.13887302577495575, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 749.25, + "completions/mean_terminated_length": 563.7142944335938, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.017893377605607823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.00024798521008051466, + "learning_rate": 1.7695852534562214e-06, + "loss": 0.0, + "num_tokens": 903390.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 404.5, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.018077845415974913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.00027365733876649756, + "learning_rate": 1.7880184331797238e-06, + "loss": 0.0, + "num_tokens": 909402.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 477.625, + "completions/mean_terminated_length": 477.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.018262313226342003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0002560757529863622, + "learning_rate": 1.8064516129032258e-06, + "loss": 0.0, + "num_tokens": 918223.0, + "reward": 1.1370967626571655, + "reward_std": 0.7183694839477539, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5120967626571655, + "rewards/fixed_code_pass_all_test_reward/std": 0.35017356276512146, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 288.625, + "completions/mean_terminated_length": 288.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.018446781036709093, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.0003022496966877952, + "learning_rate": 1.8248847926267283e-06, + "loss": 0.0, + "num_tokens": 923452.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 353.375, + "completions/mean_terminated_length": 353.375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.018631248847076187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.000247218271397287, + "learning_rate": 1.8433179723502307e-06, + "loss": 0.0, + "num_tokens": 931719.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 404.375, + "completions/mean_terminated_length": 404.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.018815716657443277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0002791819078993285, + "learning_rate": 1.8617511520737327e-06, + "loss": 0.0, + "num_tokens": 941634.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 460.25, + "completions/mean_terminated_length": 460.25, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.019000184467810367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.00025024179558386095, + "learning_rate": 1.880184331797235e-06, + "loss": 0.0, + "num_tokens": 949820.0, + "reward": 0.90625, + "reward_std": 0.6179162263870239, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.27973026037216187, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 300.75, + "completions/mean_terminated_length": 300.75, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.019184652278177457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.00030522290489898296, + "learning_rate": 1.8986175115207374e-06, + "loss": 0.0, + "num_tokens": 955074.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.01936912008854455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.0002908794067479903, + "learning_rate": 1.91705069124424e-06, + "loss": 0.0, + "num_tokens": 960924.0, + "reward": 1.3125, + "reward_std": 0.6868232488632202, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.47087812423706055, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1988.0, + "completions/max_terminated_length": 1988.0, + "completions/mean_length": 577.625, + "completions/mean_terminated_length": 577.625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.01955358789891164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.00024830274196574464, + "learning_rate": 1.935483870967742e-06, + "loss": 0.0, + "num_tokens": 968777.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 659.875, + "completions/mean_terminated_length": 659.875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.01973805570927873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.0002435520655126311, + "learning_rate": 1.9539170506912444e-06, + "loss": 0.0, + "num_tokens": 981424.0, + "reward": 1.3068182468414307, + "reward_std": 0.4262283742427826, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5568181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.37776920199394226, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 424.75, + "completions/mean_terminated_length": 424.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.01992252351964582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.00024267719345516525, + "learning_rate": 1.9723502304147468e-06, + "loss": 0.0, + "num_tokens": 991270.0, + "reward": 1.2890625, + "reward_std": 0.7052794694900513, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5390625, + "rewards/fixed_code_pass_all_test_reward/std": 0.286776065826416, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 466.5, + "completions/mean_terminated_length": 466.5, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.020106991330012914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.00026080430689034984, + "learning_rate": 1.990783410138249e-06, + "loss": 0.0, + "num_tokens": 1002474.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 377.875, + "completions/mean_terminated_length": 377.875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.020291459140380004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0002325564810234937, + "learning_rate": 2.0092165898617515e-06, + "loss": 0.0, + "num_tokens": 1009785.0, + "reward": 1.4285714626312256, + "reward_std": 0.7284314036369324, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.38132426142692566, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 449.625, + "completions/mean_terminated_length": 449.625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.020475926950747094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.00016853617398737697, + "learning_rate": 2.027649769585254e-06, + "loss": 0.0, + "num_tokens": 1019646.0, + "reward": 1.2374999523162842, + "reward_std": 0.61105877161026, + "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.38055410981178284, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1135.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 464.875, + "completions/mean_terminated_length": 464.875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.020660394761114184, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.000360059981176164, + "learning_rate": 2.046082949308756e-06, + "loss": 0.0, + "num_tokens": 1029933.0, + "reward": 0.836538553237915, + "reward_std": 0.6231482028961182, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21153846383094788, + "rewards/fixed_code_pass_all_test_reward/std": 0.16446846723556519, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 616.625, + "completions/mean_terminated_length": 616.625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.020844862571481278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.00024101105464069406, + "learning_rate": 2.0645161290322582e-06, + "loss": 0.0, + "num_tokens": 1045258.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 512.5, + "completions/mean_terminated_length": 512.5, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.021029330381848368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0003414558359509101, + "learning_rate": 2.0829493087557606e-06, + "loss": 0.0, + "num_tokens": 1053870.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 575.75, + "completions/mean_terminated_length": 575.75, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.021213798192215458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.00022292660651146434, + "learning_rate": 2.1013824884792626e-06, + "loss": 0.0, + "num_tokens": 1065236.0, + "reward": 0.9375, + "reward_std": 0.6232117414474487, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 486.375, + "completions/mean_terminated_length": 486.375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.021398266002582548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.00018211517999588978, + "learning_rate": 2.119815668202765e-06, + "loss": 0.0, + "num_tokens": 1076815.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 311.625, + "completions/mean_terminated_length": 311.625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.02158273381294964, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.0004432215409906348, + "learning_rate": 2.1382488479262673e-06, + "loss": 0.0, + "num_tokens": 1082892.0, + "reward": 1.7083333730697632, + "reward_std": 0.7000566720962524, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.35634833574295044, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 400.875, + "completions/mean_terminated_length": 400.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.02176720162331673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.000386988684113021, + "learning_rate": 2.1566820276497697e-06, + "loss": 0.0, + "num_tokens": 1091219.0, + "reward": 0.512499988079071, + "reward_std": 0.49407199025154114, + "rewards/fixed_code_pass_all_test_reward/mean": 0.26249998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.1922609806060791, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.02195166943368382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.00035162228959961794, + "learning_rate": 2.175115207373272e-06, + "loss": 0.0, + "num_tokens": 1096573.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 591.875, + "completions/mean_terminated_length": 591.875, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.02213613724405091, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.109375, + "kl": 0.00040626297595736105, + "learning_rate": 2.1935483870967745e-06, + "loss": 0.0, + "num_tokens": 1107100.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 340.875, + "completions/mean_terminated_length": 340.875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.022320605054418005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.00030954191788623575, + "learning_rate": 2.211981566820277e-06, + "loss": 0.0, + "num_tokens": 1114083.0, + "reward": 0.980555534362793, + "reward_std": 0.4398873448371887, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10555555671453476, + "rewards/fixed_code_pass_all_test_reward/std": 0.19581152498722076, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 519.125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.022505072864785095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.00022163750509207603, + "learning_rate": 2.230414746543779e-06, + "loss": 0.0, + "num_tokens": 1124988.0, + "reward": 0.9471153616905212, + "reward_std": 0.38622599840164185, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07211538404226303, + "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1349.0, + "completions/max_terminated_length": 1349.0, + "completions/mean_length": 523.125, + "completions/mean_terminated_length": 523.125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.022689540675152185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.00034853070246754214, + "learning_rate": 2.248847926267281e-06, + "loss": 0.0, + "num_tokens": 1137429.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 460.0, + "completions/mean_terminated_length": 460.0, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.022874008485519275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.0002755507011897862, + "learning_rate": 2.2672811059907836e-06, + "loss": 0.0, + "num_tokens": 1149077.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 228.125, + "completions/mean_terminated_length": 228.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.02305847629588637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0004555258601612877, + "learning_rate": 2.285714285714286e-06, + "loss": 0.0, + "num_tokens": 1153878.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 447.25, + "completions/mean_terminated_length": 447.25, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.02324294410625346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.0002706130308069987, + "learning_rate": 2.3041474654377884e-06, + "loss": 0.0, + "num_tokens": 1165792.0, + "reward": 1.860576868057251, + "reward_std": 0.35006189346313477, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9855769276618958, + "rewards/fixed_code_pass_all_test_reward/std": 0.04079463332891464, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 728.5, + "completions/mean_terminated_length": 540.0, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.02342741191662055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.00026130559672310483, + "learning_rate": 2.3225806451612907e-06, + "loss": 0.0, + "num_tokens": 1179604.0, + "reward": 0.875, + "reward_std": 0.4432026147842407, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 435.5, + "completions/mean_terminated_length": 435.5, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.02361187972698764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.000289637715468416, + "learning_rate": 2.3410138248847927e-06, + "loss": 0.0, + "num_tokens": 1187752.0, + "reward": 0.942307710647583, + "reward_std": 0.4461728036403656, + "rewards/fixed_code_pass_all_test_reward/mean": 0.19230769574642181, + "rewards/fixed_code_pass_all_test_reward/std": 0.08223423361778259, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 515.5, + "completions/mean_terminated_length": 515.5, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.023796347537354733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.00040351709321839735, + "learning_rate": 2.359447004608295e-06, + "loss": 0.0, + "num_tokens": 1197380.0, + "reward": 0.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 523.5, + "completions/mean_terminated_length": 523.5, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.023980815347721823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0003619024882937083, + "learning_rate": 2.3778801843317975e-06, + "loss": 0.0, + "num_tokens": 1208256.0, + "reward": 1.0, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 305.0, + "completions/mean_terminated_length": 305.0, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.024165283158088913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0002477585449014441, + "learning_rate": 2.3963133640553e-06, + "loss": 0.0, + "num_tokens": 1216512.0, + "reward": 1.7357953786849976, + "reward_std": 0.2032562494277954, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.20325623452663422, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 342.25, + "completions/mean_terminated_length": 342.25, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.024349750968456003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.0007874341190472478, + "learning_rate": 2.4147465437788022e-06, + "loss": 0.0, + "num_tokens": 1222186.0, + "reward": 0.875, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1685.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 663.125, + "completions/mean_terminated_length": 663.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.024534218778823096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.00027552976871447754, + "learning_rate": 2.4331797235023046e-06, + "loss": 0.0, + "num_tokens": 1233787.0, + "reward": 1.242347002029419, + "reward_std": 0.8792401552200317, + "rewards/fixed_code_pass_all_test_reward/mean": 0.49234694242477417, + "rewards/fixed_code_pass_all_test_reward/std": 0.5267224311828613, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 700.0, + "completions/mean_terminated_length": 700.0, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.024718686589190186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.0003037431961274706, + "learning_rate": 2.4516129032258066e-06, + "loss": 0.0, + "num_tokens": 1248771.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1730.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 756.75, + "completions/mean_terminated_length": 756.75, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.024903154399557276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.0002866925542548415, + "learning_rate": 2.470046082949309e-06, + "loss": 0.0, + "num_tokens": 1265217.0, + "reward": 0.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 585.25, + "completions/mean_terminated_length": 585.25, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.025087622209924366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.00031472905902774073, + "learning_rate": 2.4884792626728113e-06, + "loss": 0.0, + "num_tokens": 1276923.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 489.75, + "completions/mean_terminated_length": 489.75, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.02527209002029146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.00031833308094064705, + "learning_rate": 2.5069124423963137e-06, + "loss": 0.0, + "num_tokens": 1286377.0, + "reward": 0.5625, + "reward_std": 0.5469068884849548, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 505.25, + "completions/mean_terminated_length": 505.25, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.02545655783065855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.0002574691297922982, + "learning_rate": 2.5253456221198157e-06, + "loss": 0.0, + "num_tokens": 1297707.0, + "reward": 1.4267241954803467, + "reward_std": 0.5558459162712097, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6767241954803467, + "rewards/fixed_code_pass_all_test_reward/std": 0.229797825217247, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 359.0, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.02564102564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.00028419451518857386, + "learning_rate": 2.543778801843318e-06, + "loss": 0.0, + "num_tokens": 1304571.0, + "reward": 1.8888888359069824, + "reward_std": 0.20573778450489044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 566.875, + "completions/mean_terminated_length": 566.875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.025825493451392734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.0004287885094527155, + "learning_rate": 2.5622119815668204e-06, + "loss": 0.0, + "num_tokens": 1315930.0, + "reward": 1.4293478727340698, + "reward_std": 0.1879972666501999, + "rewards/fixed_code_pass_all_test_reward/mean": 0.42934781312942505, + "rewards/fixed_code_pass_all_test_reward/std": 0.1879972517490387, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 210.125, + "completions/mean_terminated_length": 210.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.026009961261759824, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.00028534480225062, + "learning_rate": 2.580645161290323e-06, + "loss": 0.0, + "num_tokens": 1320547.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 434.0, + "completions/mean_terminated_length": 434.0, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.026194429072126914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.0003331211682962021, + "learning_rate": 2.5990783410138248e-06, + "loss": 0.0, + "num_tokens": 1328979.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 364.0, + "completions/mean_terminated_length": 364.0, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.026378896882494004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.0006832111193944002, + "learning_rate": 2.6175115207373276e-06, + "loss": 0.0, + "num_tokens": 1337539.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 188.0, + "completions/mean_terminated_length": 188.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.026563364692861097, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.00036721180731547065, + "learning_rate": 2.6359447004608295e-06, + "loss": 0.0, + "num_tokens": 1342483.0, + "reward": 1.1190476417541504, + "reward_std": 0.1272672861814499, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1190476194024086, + "rewards/fixed_code_pass_all_test_reward/std": 0.1272672712802887, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 193.5, + "completions/mean_terminated_length": 193.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.026747832503228187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0006242797389859334, + "learning_rate": 2.6543778801843323e-06, + "loss": 0.0, + "num_tokens": 1346807.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 633.625, + "completions/mean_terminated_length": 431.5714416503906, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.026932300313595278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.0005374398151616333, + "learning_rate": 2.6728110599078343e-06, + "loss": 0.0, + "num_tokens": 1360412.0, + "reward": 1.1335227489471436, + "reward_std": 0.8332343697547913, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38352271914482117, + "rewards/fixed_code_pass_all_test_reward/std": 0.5107228755950928, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 395.0, + "completions/mean_terminated_length": 395.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.027116768123962368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.0002991663513967069, + "learning_rate": 2.6912442396313367e-06, + "loss": 0.0, + "num_tokens": 1369804.0, + "reward": 1.6749999523162842, + "reward_std": 0.36936238408088684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.36936238408088684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.02730123593432946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0004839637695113197, + "learning_rate": 2.709677419354839e-06, + "loss": 0.0, + "num_tokens": 1374438.0, + "reward": 0.75, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 129.375, + "completions/mean_terminated_length": 129.375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.02748570374469655, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.265625, + "kl": 0.00085990996012697, + "learning_rate": 2.7281105990783414e-06, + "loss": 0.0, + "num_tokens": 1378289.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 465.25, + "completions/mean_terminated_length": 465.25, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.02767017155506364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0007587236941617448, + "learning_rate": 2.7465437788018434e-06, + "loss": 0.0, + "num_tokens": 1386859.0, + "reward": 0.40625, + "reward_std": 0.4988826811313629, + "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 387.625, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.02785463936543073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0003651740862551378, + "learning_rate": 2.764976958525346e-06, + "loss": 0.0, + "num_tokens": 1397880.0, + "reward": 0.9038461446762085, + "reward_std": 0.36948251724243164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02884615585207939, + "rewards/fixed_code_pass_all_test_reward/std": 0.057232603430747986, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 480.0, + "completions/mean_terminated_length": 480.0, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.028039107175797825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.00037873095425311476, + "learning_rate": 2.783410138248848e-06, + "loss": 0.0, + "num_tokens": 1410232.0, + "reward": 1.2727272510528564, + "reward_std": 0.5233621001243591, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, + "rewards/fixed_code_pass_all_test_reward/std": 0.43716782331466675, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 204.75, + "completions/mean_terminated_length": 204.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.028223574986164915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.0011528825780260377, + "learning_rate": 2.80184331797235e-06, + "loss": 0.0, + "num_tokens": 1414798.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 310.5, + "completions/mean_terminated_length": 310.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.028408042796532005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.0006661412990069948, + "learning_rate": 2.820276497695853e-06, + "loss": 0.0, + "num_tokens": 1423210.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 768.5, + "completions/mean_terminated_length": 768.5, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "epoch": 0.028592510606899095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71875, + "kl": 0.00028987093992327573, + "learning_rate": 2.8387096774193553e-06, + "loss": 0.0, + "num_tokens": 1436398.0, + "reward": 1.3125, + "reward_std": 0.39426735043525696, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3005340099334717, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 865.375, + "completions/mean_terminated_length": 696.4285888671875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.02877697841726619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.00040699875353311654, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0, + "num_tokens": 1451065.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 554.75, + "completions/mean_terminated_length": 554.75, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.02896144622763328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.0004439274580363417, + "learning_rate": 2.87557603686636e-06, + "loss": 0.0, + "num_tokens": 1461271.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 483.875, + "completions/mean_terminated_length": 483.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.02914591403800037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0003580423963285284, + "learning_rate": 2.894009216589862e-06, + "loss": 0.0, + "num_tokens": 1471838.0, + "reward": 1.1781609058380127, + "reward_std": 0.3656545579433441, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3031609356403351, + "rewards/fixed_code_pass_all_test_reward/std": 0.36817196011543274, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 600.5, + "completions/mean_terminated_length": 393.71429443359375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.02933038184836746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.0002662021815922344, + "learning_rate": 2.912442396313364e-06, + "loss": 0.0, + "num_tokens": 1482482.0, + "reward": 1.2857143878936768, + "reward_std": 0.6325119733810425, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.39714542031288147, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 664.125, + "completions/mean_terminated_length": 466.4285888671875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.029514849658734552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.0005041680597059894, + "learning_rate": 2.9308755760368668e-06, + "loss": 0.0, + "num_tokens": 1494515.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 604.5, + "completions/mean_terminated_length": 604.5, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.029699317469101642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0005147759948158637, + "learning_rate": 2.9493087557603687e-06, + "loss": 0.0, + "num_tokens": 1505223.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 261.75, + "completions/mean_terminated_length": 261.75, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.029883785279468732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0009588434695615433, + "learning_rate": 2.967741935483871e-06, + "loss": 0.0, + "num_tokens": 1511101.0, + "reward": 0.511363685131073, + "reward_std": 0.5118144154548645, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, + "rewards/fixed_code_pass_all_test_reward/std": 0.08416546881198883, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 234.75, + "completions/mean_terminated_length": 234.75, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.030068253089835822, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.0004337240316090174, + "learning_rate": 2.9861751152073735e-06, + "loss": 0.0, + "num_tokens": 1517459.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 386.75, + "completions/mean_terminated_length": 386.75, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.030252720900202916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.000409138987379265, + "learning_rate": 3.004608294930876e-06, + "loss": 0.0, + "num_tokens": 1524921.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 525.5, + "completions/mean_terminated_length": 525.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.030437188710570006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.00037674761551897973, + "learning_rate": 3.023041474654378e-06, + "loss": 0.0, + "num_tokens": 1536117.0, + "reward": 1.2666666507720947, + "reward_std": 0.881557047367096, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5166666507720947, + "rewards/fixed_code_pass_all_test_reward/std": 0.5173191428184509, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 586.75, + "completions/mean_terminated_length": 586.75, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.030621656520937096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.0005971487025817623, + "learning_rate": 3.0414746543778806e-06, + "loss": 0.0, + "num_tokens": 1546355.0, + "reward": 1.03125, + "reward_std": 0.5597928762435913, + "rewards/fixed_code_pass_all_test_reward/mean": 0.28125, + "rewards/fixed_code_pass_all_test_reward/std": 0.1833198070526123, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 508.0, + "completions/mean_terminated_length": 508.0, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.030806124331304186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.0007573354814667255, + "learning_rate": 3.0599078341013826e-06, + "loss": 0.0, + "num_tokens": 1556139.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.03099059214167128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0004650029350159457, + "learning_rate": 3.078341013824885e-06, + "loss": 0.0, + "num_tokens": 1560921.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 941.875, + "completions/mean_terminated_length": 783.857177734375, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.03117505995203837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.00032446978366351686, + "learning_rate": 3.0967741935483874e-06, + "loss": 0.0, + "num_tokens": 1575536.0, + "reward": 0.9453125, + "reward_std": 0.6599068641662598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.33103513717651367, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.03135952776240546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.0011351883367751725, + "learning_rate": 3.1152073732718897e-06, + "loss": 0.0, + "num_tokens": 1581752.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 447.375, + "completions/mean_terminated_length": 447.375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.03154399557277255, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.0008446653191640507, + "learning_rate": 3.1336405529953917e-06, + "loss": 0.0, + "num_tokens": 1591875.0, + "reward": 1.4375, + "reward_std": 0.6603119969367981, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.38768237829208374, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.03172846338313964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0004901043084828416, + "learning_rate": 3.1520737327188945e-06, + "loss": 0.0, + "num_tokens": 1596970.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 386.5, + "completions/mean_terminated_length": 386.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.03191293119350673, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.002058852749541984, + "learning_rate": 3.1705069124423965e-06, + "loss": 0.0001, + "num_tokens": 1603902.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 927.75, + "completions/mean_terminated_length": 554.3333740234375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.032097399003873824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.0007004580547800288, + "learning_rate": 3.1889400921658984e-06, + "loss": 0.0, + "num_tokens": 1623460.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 477.125, + "completions/mean_terminated_length": 477.125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.03228186681424092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.0004310562726459466, + "learning_rate": 3.2073732718894012e-06, + "loss": 0.0, + "num_tokens": 1635029.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1568.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 693.0, + "completions/mean_terminated_length": 693.0, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.032466334624608004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0006872646517877001, + "learning_rate": 3.225806451612903e-06, + "loss": 0.0, + "num_tokens": 1645349.0, + "reward": 1.1875, + "reward_std": 0.5303300619125366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 507.625, + "completions/mean_terminated_length": 507.625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.0326508024349751, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.0011440341331763193, + "learning_rate": 3.244239631336406e-06, + "loss": 0.0, + "num_tokens": 1653970.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.03283527024534219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0096435546875, + "kl": 0.0006141647609183565, + "learning_rate": 3.2626728110599084e-06, + "loss": 0.0, + "num_tokens": 1662603.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 504.75, + "completions/mean_terminated_length": 504.75, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.03301973805570928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.0007241026942210738, + "learning_rate": 3.2811059907834103e-06, + "loss": 0.0, + "num_tokens": 1674569.0, + "reward": 1.0277777910232544, + "reward_std": 0.4474107325077057, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2777777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.13280318677425385, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 348.0, + "completions/mean_terminated_length": 348.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.03320420586607637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0005435090661194408, + "learning_rate": 3.299539170506913e-06, + "loss": 0.0, + "num_tokens": 1680073.0, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 429.125, + "completions/mean_terminated_length": 429.125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.03338867367644346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.0008517967544321436, + "learning_rate": 3.317972350230415e-06, + "loss": 0.0, + "num_tokens": 1688330.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 426.125, + "completions/mean_terminated_length": 426.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.03357314148681055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0007898740223026834, + "learning_rate": 3.336405529953917e-06, + "loss": 0.0, + "num_tokens": 1698747.0, + "reward": 0.9451218843460083, + "reward_std": 0.39127692580223083, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07012195140123367, + "rewards/fixed_code_pass_all_test_reward/std": 0.0897931158542633, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 441.25, + "completions/mean_terminated_length": 441.25, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.033757609297177645, + "frac_reward_zero_std": 0.0, + "grad_norm": 37.25, + "kl": 0.004780470324476482, + "learning_rate": 3.35483870967742e-06, + "loss": 0.0002, + "num_tokens": 1710133.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 259.125, + "completions/mean_terminated_length": 259.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.03394207710754473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0006970599461055826, + "learning_rate": 3.373271889400922e-06, + "loss": 0.0, + "num_tokens": 1715054.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 397.75, + "completions/mean_terminated_length": 397.75, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.034126544917911825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.00037298203278623987, + "learning_rate": 3.391705069124424e-06, + "loss": 0.0, + "num_tokens": 1722196.0, + "reward": 1.28125, + "reward_std": 0.6967719197273254, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4759858250617981, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 414.5, + "completions/mean_terminated_length": 414.5, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.03431101272827892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.0009564059146214277, + "learning_rate": 3.4101382488479266e-06, + "loss": 0.0, + "num_tokens": 1730192.0, + "reward": 0.8088235259056091, + "reward_std": 0.4387890696525574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05882352963089943, + "rewards/fixed_code_pass_all_test_reward/std": 0.10892001539468765, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1152.0, + "completions/max_terminated_length": 1152.0, + "completions/mean_length": 556.75, + "completions/mean_terminated_length": 556.75, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.034495480538646005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.0005112210856168531, + "learning_rate": 3.428571428571429e-06, + "loss": 0.0, + "num_tokens": 1739518.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 380.75, + "completions/mean_terminated_length": 380.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.0346799483490131, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.000551451896171784, + "learning_rate": 3.447004608294931e-06, + "loss": 0.0, + "num_tokens": 1748572.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 404.375, + "completions/mean_terminated_length": 404.375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.034864416159380185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.0009702303104859311, + "learning_rate": 3.4654377880184337e-06, + "loss": 0.0, + "num_tokens": 1758527.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 668.375, + "completions/mean_terminated_length": 668.375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.03504888396974728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.0005567252701439429, + "learning_rate": 3.4838709677419357e-06, + "loss": 0.0, + "num_tokens": 1772154.0, + "reward": 0.8125, + "reward_std": 0.5303300619125366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 390.375, + "completions/mean_terminated_length": 390.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.03523335178011437, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.000561461471079383, + "learning_rate": 3.502304147465438e-06, + "loss": 0.0, + "num_tokens": 1778469.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 478.75, + "completions/mean_terminated_length": 478.75, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.03541781959048146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.0013180634887248743, + "learning_rate": 3.5207373271889404e-06, + "loss": 0.0001, + "num_tokens": 1789611.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 536.875, + "completions/mean_terminated_length": 536.875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.03560228740084855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00701904296875, + "kl": 0.0004805325843335595, + "learning_rate": 3.539170506912443e-06, + "loss": 0.0, + "num_tokens": 1799818.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.035786755211215646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0009351367934868904, + "learning_rate": 3.5576036866359448e-06, + "loss": 0.0, + "num_tokens": 1804221.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.03597122302158273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.0007711461621511262, + "learning_rate": 3.5760368663594476e-06, + "loss": 0.0, + "num_tokens": 1811261.0, + "reward": 1.3306450843811035, + "reward_std": 0.244304820895195, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3306451439857483, + "rewards/fixed_code_pass_all_test_reward/std": 0.2443048357963562, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 444.0, + "completions/mean_terminated_length": 444.0, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.036155690831949826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.0012634655431611463, + "learning_rate": 3.5944700460829495e-06, + "loss": 0.0001, + "num_tokens": 1819717.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 277.375, + "completions/mean_terminated_length": 277.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.03634015864231691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0018502646016713697, + "learning_rate": 3.6129032258064515e-06, + "loss": 0.0001, + "num_tokens": 1825000.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 462.75, + "completions/mean_terminated_length": 462.75, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.036524626452684006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.0008934969082474709, + "learning_rate": 3.6313364055299543e-06, + "loss": 0.0, + "num_tokens": 1834214.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 469.375, + "completions/mean_terminated_length": 469.375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.0367090942630511, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.0009618061376386322, + "learning_rate": 3.6497695852534567e-06, + "loss": 0.0, + "num_tokens": 1842385.0, + "reward": 1.5367646217346191, + "reward_std": 0.3228602707386017, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6617646813392639, + "rewards/fixed_code_pass_all_test_reward/std": 0.20558202266693115, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 394.875, + "completions/mean_terminated_length": 394.875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.036893562073418186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.0004899756622762652, + "learning_rate": 3.6682027649769586e-06, + "loss": 0.0, + "num_tokens": 1849768.0, + "reward": 1.8571429252624512, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 989.625, + "completions/mean_terminated_length": 838.4285888671875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.03707802988378528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0006023468413332012, + "learning_rate": 3.6866359447004615e-06, + "loss": 0.0, + "num_tokens": 1867349.0, + "reward": 0.6160714626312256, + "reward_std": 0.6958723664283752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2410714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.3705395758152008, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 404.375, + "completions/mean_terminated_length": 404.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.03726249769415237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0007531879418820608, + "learning_rate": 3.7050691244239634e-06, + "loss": 0.0, + "num_tokens": 1876640.0, + "reward": 1.6812500953674316, + "reward_std": 0.3837665617465973, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8062499761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2786286771297455, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.03744696550451946, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.0011236622594879009, + "learning_rate": 3.7235023041474654e-06, + "loss": 0.0, + "num_tokens": 1881537.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1221.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 644.625, + "completions/mean_terminated_length": 644.625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.03763143331488655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.0008307326352223754, + "learning_rate": 3.741935483870968e-06, + "loss": 0.0, + "num_tokens": 1893966.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 417.875, + "completions/mean_terminated_length": 417.875, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.03781590112525364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.0008440261881332844, + "learning_rate": 3.76036866359447e-06, + "loss": 0.0, + "num_tokens": 1902429.0, + "reward": 1.0119047164916992, + "reward_std": 0.03367177024483681, + "rewards/fixed_code_pass_all_test_reward/mean": 0.011904762126505375, + "rewards/fixed_code_pass_all_test_reward/std": 0.033671751618385315, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 661.375, + "completions/mean_terminated_length": 661.375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.03800036893562073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.0005015727947466075, + "learning_rate": 3.7788018433179725e-06, + "loss": 0.0, + "num_tokens": 1914760.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 356.625, + "completions/mean_terminated_length": 356.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.03818483674598783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.0009364754696434829, + "learning_rate": 3.797235023041475e-06, + "loss": 0.0, + "num_tokens": 1920421.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 403.625, + "completions/mean_terminated_length": 403.625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.03836930455635491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0128173828125, + "kl": 0.001008490078675095, + "learning_rate": 3.815668202764977e-06, + "loss": 0.0, + "num_tokens": 1929066.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 837.125, + "completions/mean_terminated_length": 664.1428833007812, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.03855377236672201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0006416336327674799, + "learning_rate": 3.83410138248848e-06, + "loss": 0.0, + "num_tokens": 1939659.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 589.0, + "completions/mean_terminated_length": 589.0, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.0387382401770891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.00039901621858007275, + "learning_rate": 3.852534562211982e-06, + "loss": 0.0, + "num_tokens": 1950787.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 260.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.03892270798745619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.002401056233793497, + "learning_rate": 3.870967741935484e-06, + "loss": 0.0001, + "num_tokens": 1956708.0, + "reward": 1.149999976158142, + "reward_std": 0.6047431826591492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.13887302577495575, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 319.25, + "completions/mean_terminated_length": 319.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.03910717579782328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0014002888492541388, + "learning_rate": 3.889400921658986e-06, + "loss": 0.0001, + "num_tokens": 1963670.0, + "reward": 0.9711538553237915, + "reward_std": 0.8324946761131287, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3461538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.41526252031326294, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 365.75, + "completions/mean_terminated_length": 365.75, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.039291643608190374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.0014920804896974005, + "learning_rate": 3.907834101382489e-06, + "loss": 0.0001, + "num_tokens": 1970916.0, + "reward": 1.1745688915252686, + "reward_std": 0.06413999944925308, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17456898093223572, + "rewards/fixed_code_pass_all_test_reward/std": 0.06413999199867249, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 459.0, + "completions/mean_terminated_length": 459.0, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.03947611141855746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.703125, + "kl": 0.0003846293820970459, + "learning_rate": 3.926267281105991e-06, + "loss": 0.0, + "num_tokens": 1978796.0, + "reward": 1.774999976158142, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1180.0, + "completions/max_terminated_length": 1180.0, + "completions/mean_length": 887.125, + "completions/mean_terminated_length": 887.125, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.039660579228924554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.0005514328586286865, + "learning_rate": 3.9447004608294935e-06, + "loss": 0.0, + "num_tokens": 1996381.0, + "reward": 0.8274999856948853, + "reward_std": 0.5537340641021729, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07750000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.21920311450958252, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.03984504703929164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0032268812356051058, + "learning_rate": 3.963133640552996e-06, + "loss": 0.0001, + "num_tokens": 2001982.0, + "reward": 1.15625, + "reward_std": 0.6365013122558594, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4943881630897522, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 448.375, + "completions/mean_terminated_length": 219.85714721679688, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.040029514849658734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0007804899214534089, + "learning_rate": 3.981566820276498e-06, + "loss": 0.0, + "num_tokens": 2008481.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 426.25, + "completions/mean_terminated_length": 426.25, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.04021398266002583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0016736947472963948, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0001, + "num_tokens": 2019787.0, + "reward": 1.685049057006836, + "reward_std": 0.45926016569137573, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8100490570068359, + "rewards/fixed_code_pass_all_test_reward/std": 0.37442171573638916, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 494.875, + "completions/mean_terminated_length": 494.875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.040398450470392915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0005830442496517207, + "learning_rate": 4.018433179723503e-06, + "loss": 0.0, + "num_tokens": 2030442.0, + "reward": 1.6057692766189575, + "reward_std": 0.33765512704849243, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6057692170143127, + "rewards/fixed_code_pass_all_test_reward/std": 0.3376551568508148, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 641.375, + "completions/mean_terminated_length": 641.375, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.04058291828076001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.0011189756587555166, + "learning_rate": 4.036866359447005e-06, + "loss": 0.0, + "num_tokens": 2041941.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 350.75, + "completions/mean_terminated_length": 350.75, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0407673860911271, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0011060474753321614, + "learning_rate": 4.055299539170508e-06, + "loss": 0.0, + "num_tokens": 2050891.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.04095185390149419, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0011242312903050333, + "learning_rate": 4.073732718894009e-06, + "loss": 0.0, + "num_tokens": 2055435.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 248.625, + "completions/mean_terminated_length": 248.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.04113632171186128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0007512669253628701, + "learning_rate": 4.092165898617512e-06, + "loss": 0.0, + "num_tokens": 2060448.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 246.25, + "completions/mean_terminated_length": 246.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.04132078952222837, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.001078595056242193, + "learning_rate": 4.110599078341014e-06, + "loss": 0.0, + "num_tokens": 2065170.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 457.75, + "completions/mean_terminated_length": 230.57144165039062, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.04150525733259546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.000705373466189485, + "learning_rate": 4.1290322580645165e-06, + "loss": 0.0, + "num_tokens": 2072344.0, + "reward": 1.2083332538604736, + "reward_std": 0.6943650841712952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 436.0, + "completions/mean_terminated_length": 436.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.041689725142962555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0036429875399335288, + "learning_rate": 4.147465437788019e-06, + "loss": 0.0001, + "num_tokens": 2081048.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 553.75, + "completions/mean_terminated_length": 553.75, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.04187419295332964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.0008196749004127923, + "learning_rate": 4.165898617511521e-06, + "loss": 0.0, + "num_tokens": 2091966.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.042058660763696736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.001910282313474454, + "learning_rate": 4.184331797235024e-06, + "loss": 0.0001, + "num_tokens": 2096362.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 459.5, + "completions/mean_terminated_length": 459.5, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.04224312857406383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.0005780758474429604, + "learning_rate": 4.202764976958525e-06, + "loss": 0.0, + "num_tokens": 2109766.0, + "reward": 1.9039256572723389, + "reward_std": 0.05909234285354614, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9039256572723389, + "rewards/fixed_code_pass_all_test_reward/std": 0.059092361479997635, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.042427596384430916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.0012948763542226516, + "learning_rate": 4.221198156682028e-06, + "loss": 0.0001, + "num_tokens": 2120027.0, + "reward": 1.3125, + "reward_std": 0.6512351036071777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 643.375, + "completions/mean_terminated_length": 442.71429443359375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.04261206419479801, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.0007696978900639806, + "learning_rate": 4.23963133640553e-06, + "loss": 0.0, + "num_tokens": 2129558.0, + "reward": 0.8352272510528564, + "reward_std": 0.4413256049156189, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08522727340459824, + "rewards/fixed_code_pass_all_test_reward/std": 0.0985056608915329, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 561.25, + "completions/mean_terminated_length": 561.25, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.042796532005165096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.0009734336363180773, + "learning_rate": 4.258064516129032e-06, + "loss": 0.0, + "num_tokens": 2144648.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 480.75, + "completions/mean_terminated_length": 480.75, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.04298099981553219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.0008978125115390867, + "learning_rate": 4.276497695852535e-06, + "loss": 0.0, + "num_tokens": 2155806.0, + "reward": 1.5431034564971924, + "reward_std": 0.4456154704093933, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6681034564971924, + "rewards/fixed_code_pass_all_test_reward/std": 0.26434552669525146, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 517.25, + "completions/mean_terminated_length": 517.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.04316546762589928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.001383633745717816, + "learning_rate": 4.294930875576037e-06, + "loss": 0.0001, + "num_tokens": 2168128.0, + "reward": 1.2123016119003296, + "reward_std": 0.6406171321868896, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4623015820980072, + "rewards/fixed_code_pass_all_test_reward/std": 0.4040108919143677, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.04334993543626637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.0009061945747816935, + "learning_rate": 4.3133640552995395e-06, + "loss": 0.0, + "num_tokens": 2173720.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 360.75, + "completions/mean_terminated_length": 360.75, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.04353440324663346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.00369867637346033, + "learning_rate": 4.331797235023042e-06, + "loss": 0.0001, + "num_tokens": 2180910.0, + "reward": 0.9937500357627869, + "reward_std": 0.6662461161613464, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4937499761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2555631101131439, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 736.625, + "completions/mean_terminated_length": 549.2857666015625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.043718871057000556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.0015223033951770049, + "learning_rate": 4.350230414746544e-06, + "loss": 0.0001, + "num_tokens": 2193475.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 150.625, + "completions/mean_terminated_length": 150.625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.04390333886736764, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.0016937976288318168, + "learning_rate": 4.368663594470047e-06, + "loss": 0.0001, + "num_tokens": 2197336.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 248.0, + "completions/mean_terminated_length": 248.0, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.04408780667773474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0010976589474012144, + "learning_rate": 4.387096774193549e-06, + "loss": 0.0, + "num_tokens": 2205640.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 517.125, + "completions/mean_terminated_length": 517.125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.04427227448810182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.00152742829232011, + "learning_rate": 4.405529953917051e-06, + "loss": 0.0001, + "num_tokens": 2214385.0, + "reward": 1.4545453786849976, + "reward_std": 0.481045663356781, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7045454382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.28645122051239014, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.04445674229846892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0015726226556580514, + "learning_rate": 4.423963133640554e-06, + "loss": 0.0001, + "num_tokens": 2221321.0, + "reward": 1.0509259700775146, + "reward_std": 0.027556447312235832, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05092592537403107, + "rewards/fixed_code_pass_all_test_reward/std": 0.027556437999010086, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.04464121010883601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0009672840242274106, + "learning_rate": 4.442396313364056e-06, + "loss": 0.0, + "num_tokens": 2226841.0, + "reward": 1.3333333730697632, + "reward_std": 0.5634361505508423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 379.875, + "completions/mean_terminated_length": 379.875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.0448256779192031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01397705078125, + "kl": 0.0014169239657348953, + "learning_rate": 4.460829493087558e-06, + "loss": 0.0001, + "num_tokens": 2237448.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 317.75, + "completions/mean_terminated_length": 317.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.04501014572957019, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.002318118786206469, + "learning_rate": 4.479262672811061e-06, + "loss": 0.0001, + "num_tokens": 2246310.0, + "reward": 1.6744791269302368, + "reward_std": 0.3847423791885376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6744791269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.3847424387931824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 190.875, + "completions/mean_terminated_length": 190.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.045194613539937284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0016356291307602078, + "learning_rate": 4.497695852534562e-06, + "loss": 0.0001, + "num_tokens": 2254373.0, + "reward": 1.5, + "reward_std": 0.483615517616272, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.48361557722091675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 380.625, + "completions/mean_terminated_length": 380.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.04537908135030437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.0018660046262084506, + "learning_rate": 4.516129032258065e-06, + "loss": 0.0001, + "num_tokens": 2264386.0, + "reward": 1.3035714626312256, + "reward_std": 0.46721991896629333, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.22908106446266174, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 305.5, + "completions/mean_terminated_length": 305.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.045563549160671464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.005900959156861063, + "learning_rate": 4.534562211981567e-06, + "loss": 0.0002, + "num_tokens": 2273526.0, + "reward": 1.1875, + "reward_std": 0.5303300619125366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.04574801697103855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.0023254387742781546, + "learning_rate": 4.5529953917050696e-06, + "loss": 0.0001, + "num_tokens": 2279606.0, + "reward": 1.6875, + "reward_std": 0.43129098415374756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.43129095435142517, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.045932484781405644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.004013640558696352, + "learning_rate": 4.571428571428572e-06, + "loss": 0.0002, + "num_tokens": 2285673.0, + "reward": 1.7421875, + "reward_std": 0.27637138962745667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7421875, + "rewards/fixed_code_pass_all_test_reward/std": 0.27637138962745667, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1881.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 873.375, + "completions/mean_terminated_length": 873.375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 0.04611695259177274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019775390625, + "kl": 0.001434162666555494, + "learning_rate": 4.589861751152074e-06, + "loss": 0.0001, + "num_tokens": 2299244.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 376.0, + "completions/mean_terminated_length": 376.0, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.046301420402139824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01007080078125, + "kl": 0.0007213067365228198, + "learning_rate": 4.608294930875577e-06, + "loss": 0.0, + "num_tokens": 2306348.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 144.625, + "completions/mean_terminated_length": 144.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.04648588821250692, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.0020470824019866996, + "learning_rate": 4.626728110599078e-06, + "loss": 0.0001, + "num_tokens": 2310249.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 127.0, + "completions/mean_terminated_length": 127.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.04667035602287401, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "kl": 0.004192173364572227, + "learning_rate": 4.6451612903225815e-06, + "loss": 0.0002, + "num_tokens": 2313993.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 549.0, + "completions/mean_terminated_length": 549.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.0468548238332411, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.0034435808920534328, + "learning_rate": 4.663594470046083e-06, + "loss": 0.0001, + "num_tokens": 2323729.0, + "reward": 0.8857142925262451, + "reward_std": 0.35913729667663574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.010714286006987095, + "rewards/fixed_code_pass_all_test_reward/std": 0.030304577201604843, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.04703929164360819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0010774861366371624, + "learning_rate": 4.682027649769585e-06, + "loss": 0.0, + "num_tokens": 2328263.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 492.5, + "completions/mean_terminated_length": 492.5, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.04722375945397528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0019477826026559342, + "learning_rate": 4.700460829493088e-06, + "loss": 0.0001, + "num_tokens": 2336891.0, + "reward": 1.1500000953674316, + "reward_std": 0.29760950803756714, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, + "rewards/fixed_code_pass_all_test_reward/std": 0.29760950803756714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 422.5, + "completions/mean_terminated_length": 422.5, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.04740822726434237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.0012998752063140273, + "learning_rate": 4.71889400921659e-06, + "loss": 0.0001, + "num_tokens": 2345839.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 420.125, + "completions/mean_terminated_length": 420.125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.047592695074709465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0272216796875, + "kl": 0.002006904782319907, + "learning_rate": 4.7373271889400925e-06, + "loss": 0.0001, + "num_tokens": 2355136.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.04777716288507655, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.0020117964450037107, + "learning_rate": 4.755760368663595e-06, + "loss": 0.0001, + "num_tokens": 2359440.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 146.125, + "completions/mean_terminated_length": 146.125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.047961630695443645, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.003070744496653788, + "learning_rate": 4.774193548387097e-06, + "loss": 0.0001, + "num_tokens": 2363433.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 477.0, + "completions/mean_terminated_length": 477.0, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.04814609850581074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.0019301101892779116, + "learning_rate": 4.7926267281106e-06, + "loss": 0.0001, + "num_tokens": 2372801.0, + "reward": 1.9285714626312256, + "reward_std": 0.15272073447704315, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.15272070467472076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 478.25, + "completions/mean_terminated_length": 478.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.048330566316177825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.001288857627514517, + "learning_rate": 4.811059907834102e-06, + "loss": 0.0001, + "num_tokens": 2384587.0, + "reward": 1.6130952835083008, + "reward_std": 0.4839085340499878, + "rewards/fixed_code_pass_all_test_reward/mean": 0.613095223903656, + "rewards/fixed_code_pass_all_test_reward/std": 0.4839085638523102, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 230.0, + "completions/mean_terminated_length": 230.0, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.04851503412654492, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.002426694249152206, + "learning_rate": 4.8294930875576044e-06, + "loss": 0.0001, + "num_tokens": 2392219.0, + "reward": 1.946874976158142, + "reward_std": 0.15026018023490906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9468749761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.15026019513607025, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 327.25, + "completions/mean_terminated_length": 327.25, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.048699501936912006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.0017713702836772427, + "learning_rate": 4.847926267281106e-06, + "loss": 0.0001, + "num_tokens": 2401813.0, + "reward": 1.277438998222351, + "reward_std": 0.077059805393219, + "rewards/fixed_code_pass_all_test_reward/mean": 0.27743902802467346, + "rewards/fixed_code_pass_all_test_reward/std": 0.07705983519554138, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 500.75, + "completions/mean_terminated_length": 279.71429443359375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.0488839697472791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71484375, + "kl": 0.002141762764949817, + "learning_rate": 4.866359447004609e-06, + "loss": 0.0001, + "num_tokens": 2409083.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 480.375, + "completions/mean_terminated_length": 480.375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.04906843755764619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.0015990414540283382, + "learning_rate": 4.884792626728111e-06, + "loss": 0.0001, + "num_tokens": 2417918.0, + "reward": 1.3392857313156128, + "reward_std": 0.5503113269805908, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.21257823705673218, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 460.875, + "completions/mean_terminated_length": 460.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.04925290536801328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.00690473157010274, + "learning_rate": 4.903225806451613e-06, + "loss": 0.0003, + "num_tokens": 2430333.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1317.0, + "completions/max_terminated_length": 1317.0, + "completions/mean_length": 551.875, + "completions/mean_terminated_length": 551.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.04943737317838037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.0015725677512818947, + "learning_rate": 4.9216589861751155e-06, + "loss": 0.0001, + "num_tokens": 2442668.0, + "reward": 1.019230842590332, + "reward_std": 0.05439284071326256, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01923076994717121, + "rewards/fixed_code_pass_all_test_reward/std": 0.05439283326268196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 268.625, + "completions/mean_terminated_length": 268.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.049621840988747466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.0010252786814817227, + "learning_rate": 4.940092165898618e-06, + "loss": 0.0, + "num_tokens": 2448225.0, + "reward": 1.7142856121063232, + "reward_std": 0.42515644431114197, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.31886735558509827, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 298.125, + "completions/mean_terminated_length": 298.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.04980630879911455, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018798828125, + "kl": 0.0011696503061102703, + "learning_rate": 4.95852534562212e-06, + "loss": 0.0, + "num_tokens": 2454034.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 413.625, + "completions/mean_terminated_length": 413.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.049990776609481646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.003565599057765212, + "learning_rate": 4.976958525345623e-06, + "loss": 0.0001, + "num_tokens": 2462727.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 495.75, + "completions/mean_terminated_length": 495.75, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.05017524441984873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0027229955594521016, + "learning_rate": 4.995391705069125e-06, + "loss": 0.0001, + "num_tokens": 2473381.0, + "reward": 1.4375, + "reward_std": 0.3204349875450134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 407.375, + "completions/mean_terminated_length": 407.375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.050359712230215826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.004993542192096356, + "learning_rate": 5.013824884792627e-06, + "loss": 0.0002, + "num_tokens": 2481512.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 386.125, + "completions/mean_terminated_length": 386.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.05054418004058292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.0013789842996629886, + "learning_rate": 5.032258064516129e-06, + "loss": 0.0001, + "num_tokens": 2491793.0, + "reward": 1.2000000476837158, + "reward_std": 0.21380895376205444, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.21380901336669922, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 679.25, + "completions/mean_terminated_length": 679.25, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.05072864785095001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.002456085945595987, + "learning_rate": 5.050691244239631e-06, + "loss": 0.0001, + "num_tokens": 2503851.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.0509131156613171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.0026333942369092256, + "learning_rate": 5.0691244239631346e-06, + "loss": 0.0001, + "num_tokens": 2510483.0, + "reward": 1.25, + "reward_std": 0.0609799362719059, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.060979947447776794, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 825.75, + "completions/mean_terminated_length": 825.75, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.051097583471684194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.486328125, + "kl": 0.0008318830896314466, + "learning_rate": 5.087557603686636e-06, + "loss": 0.0, + "num_tokens": 2524321.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 424.625, + "completions/mean_terminated_length": 424.625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.05128205128205128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.002531564299715683, + "learning_rate": 5.1059907834101385e-06, + "loss": 0.0001, + "num_tokens": 2534046.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 422.375, + "completions/mean_terminated_length": 190.1428680419922, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.051466519092418374, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.0036303385058999993, + "learning_rate": 5.124423963133641e-06, + "loss": 0.0001, + "num_tokens": 2540289.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 704.0, + "completions/mean_terminated_length": 256.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.05165098690278547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0036192442967148963, + "learning_rate": 5.142857142857142e-06, + "loss": 0.0001, + "num_tokens": 2555801.0, + "reward": 0.855769157409668, + "reward_std": 0.7256180047988892, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.24670268595218658, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 157.875, + "completions/mean_terminated_length": 157.875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.051835454713152554, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.0028796335827792063, + "learning_rate": 5.161290322580646e-06, + "loss": 0.0001, + "num_tokens": 2559824.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 121.75, + "completions/mean_terminated_length": 121.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.05201992252351965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0028399306756909937, + "learning_rate": 5.179723502304148e-06, + "loss": 0.0001, + "num_tokens": 2563606.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 407.5, + "completions/mean_terminated_length": 407.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.052204390333886734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0128173828125, + "kl": 0.0011331468558637425, + "learning_rate": 5.1981566820276495e-06, + "loss": 0.0, + "num_tokens": 2575122.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 402.0, + "completions/mean_terminated_length": 402.0, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.05238885814425383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.0017614895186852664, + "learning_rate": 5.216589861751153e-06, + "loss": 0.0001, + "num_tokens": 2583034.0, + "reward": 1.5, + "reward_std": 0.35040876269340515, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 515.875, + "completions/mean_terminated_length": 515.875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.05257332595462092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.0010412275369162671, + "learning_rate": 5.235023041474655e-06, + "loss": 0.0, + "num_tokens": 2596281.0, + "reward": 1.1646342277526855, + "reward_std": 0.8171641826629639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4146341383457184, + "rewards/fixed_code_pass_all_test_reward/std": 0.4653362035751343, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 336.0, + "completions/mean_terminated_length": 336.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.05275779376498801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.001470310686272569, + "learning_rate": 5.253456221198157e-06, + "loss": 0.0001, + "num_tokens": 2601865.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 206.25, + "completions/mean_terminated_length": 206.25, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.0529422615753551, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.002124873222783208, + "learning_rate": 5.271889400921659e-06, + "loss": 0.0001, + "num_tokens": 2607035.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 163.0, + "completions/mean_terminated_length": 163.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.053126729385722195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029052734375, + "kl": 0.002531195234041661, + "learning_rate": 5.290322580645162e-06, + "loss": 0.0001, + "num_tokens": 2611099.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 157.25, + "completions/mean_terminated_length": 157.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.05331119719608928, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035888671875, + "kl": 0.0032399199990322813, + "learning_rate": 5.308755760368665e-06, + "loss": 0.0001, + "num_tokens": 2615141.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 464.375, + "completions/mean_terminated_length": 464.375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.053495665006456375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.0015845611487748101, + "learning_rate": 5.327188940092166e-06, + "loss": 0.0001, + "num_tokens": 2623776.0, + "reward": 1.4642857313156128, + "reward_std": 0.28656336665153503, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.28656336665153503, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 248.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.05368013281682346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0026175471139140427, + "learning_rate": 5.345622119815669e-06, + "loss": 0.0001, + "num_tokens": 2628612.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 144.0, + "completions/mean_terminated_length": 144.0, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.053864600627190555, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.703125, + "kl": 0.008950897346949205, + "learning_rate": 5.364055299539172e-06, + "loss": 0.0004, + "num_tokens": 2632540.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1710.0, + "completions/mean_length": 748.125, + "completions/mean_terminated_length": 562.4285888671875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.05404906843755765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.0016333454186678864, + "learning_rate": 5.382488479262673e-06, + "loss": 0.0001, + "num_tokens": 2646797.0, + "reward": 1.3562500476837158, + "reward_std": 0.7317047715187073, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6062500476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.2982671558856964, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 500.125, + "completions/mean_terminated_length": 279.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.054233536247924735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.0013850810464646202, + "learning_rate": 5.400921658986176e-06, + "loss": 0.0001, + "num_tokens": 2656870.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.05441800405829183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.003704235816258006, + "learning_rate": 5.419354838709678e-06, + "loss": 0.0001, + "num_tokens": 2666049.0, + "reward": 1.442460298538208, + "reward_std": 0.48742881417274475, + "rewards/fixed_code_pass_all_test_reward/mean": 0.442460298538208, + "rewards/fixed_code_pass_all_test_reward/std": 0.48742881417274475, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 269.0, + "completions/mean_terminated_length": 269.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.05460247186865892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.0048446366272401065, + "learning_rate": 5.43778801843318e-06, + "loss": 0.0002, + "num_tokens": 2672257.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 463.75, + "completions/mean_terminated_length": 463.75, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.05478693967902601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0022193394688656554, + "learning_rate": 5.456221198156683e-06, + "loss": 0.0001, + "num_tokens": 2681255.0, + "reward": 1.5957791805267334, + "reward_std": 0.1994423270225525, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5957792401313782, + "rewards/fixed_code_pass_all_test_reward/std": 0.19944234192371368, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.0549714074893931, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.0016861603508004919, + "learning_rate": 5.474654377880185e-06, + "loss": 0.0001, + "num_tokens": 2686894.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 203.625, + "completions/mean_terminated_length": 203.625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.05515587529976019, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.0034283193526789546, + "learning_rate": 5.493087557603687e-06, + "loss": 0.0001, + "num_tokens": 2693891.0, + "reward": 1.6574074029922485, + "reward_std": 0.41373157501220703, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7824074029922485, + "rewards/fixed_code_pass_all_test_reward/std": 0.32915517687797546, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 314.5, + "completions/mean_terminated_length": 314.5, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.05534034311012728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023193359375, + "kl": 0.002563972091593314, + "learning_rate": 5.511520737327189e-06, + "loss": 0.0001, + "num_tokens": 2699527.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 381.875, + "completions/mean_terminated_length": 381.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.055524810920494376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.0031164427782641724, + "learning_rate": 5.529953917050692e-06, + "loss": 0.0001, + "num_tokens": 2710078.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 361.5, + "completions/mean_terminated_length": 361.5, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.05570927873086146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0028207486757310107, + "learning_rate": 5.548387096774194e-06, + "loss": 0.0001, + "num_tokens": 2719122.0, + "reward": 1.21484375, + "reward_std": 0.4048525094985962, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21484375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4048525094985962, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.055893746541228556, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.005300380915286951, + "learning_rate": 5.566820276497696e-06, + "loss": 0.0002, + "num_tokens": 2728234.0, + "reward": 1.021276593208313, + "reward_std": 0.7291032671928406, + "rewards/fixed_code_pass_all_test_reward/mean": 0.271276593208313, + "rewards/fixed_code_pass_all_test_reward/std": 0.4525967538356781, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 368.125, + "completions/mean_terminated_length": 368.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.05607821435159565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.003201348183210939, + "learning_rate": 5.585253456221199e-06, + "loss": 0.0001, + "num_tokens": 2734163.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 585.25, + "completions/mean_terminated_length": 585.25, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.056262682161962736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.0038694553077220917, + "learning_rate": 5.6036866359447e-06, + "loss": 0.0002, + "num_tokens": 2746677.0, + "reward": 0.692307710647583, + "reward_std": 0.6855592727661133, + "rewards/fixed_code_pass_all_test_reward/mean": 0.19230769574642181, + "rewards/fixed_code_pass_all_test_reward/std": 0.22893041372299194, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 603.75, + "completions/mean_terminated_length": 603.75, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.05644714997232983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.002914172764576506, + "learning_rate": 5.6221198156682035e-06, + "loss": 0.0001, + "num_tokens": 2758867.0, + "reward": 0.7434210777282715, + "reward_std": 0.4719580113887787, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, + "rewards/fixed_code_pass_all_test_reward/std": 0.1839417964220047, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.056631617782696916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02587890625, + "kl": 0.002347299494431354, + "learning_rate": 5.640552995391706e-06, + "loss": 0.0001, + "num_tokens": 2768447.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 293.375, + "completions/mean_terminated_length": 293.375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.05681608559306401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.004283955306163989, + "learning_rate": 5.658986175115207e-06, + "loss": 0.0002, + "num_tokens": 2775082.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 222.0, + "completions/mean_terminated_length": 222.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.0570005534034311, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.003466296271653846, + "learning_rate": 5.677419354838711e-06, + "loss": 0.0001, + "num_tokens": 2779626.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 113.75, + "completions/mean_terminated_length": 113.75, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.05718502121379819, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.004007494033430703, + "learning_rate": 5.695852534562213e-06, + "loss": 0.0002, + "num_tokens": 2783392.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 509.75, + "completions/mean_terminated_length": 509.75, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.057369489024165284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.0033406703441869467, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.0001, + "num_tokens": 2792878.0, + "reward": 1.3725961446762085, + "reward_std": 0.30853894352912903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3725961446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.30853894352912903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 320.875, + "completions/mean_terminated_length": 320.875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.05755395683453238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0027820689138025045, + "learning_rate": 5.732718894009217e-06, + "loss": 0.0001, + "num_tokens": 2801829.0, + "reward": 1.274999976158142, + "reward_std": 0.4527692198753357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 413.625, + "completions/mean_terminated_length": 413.625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.057738424644899464, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.86328125, + "kl": 0.0015533613477600738, + "learning_rate": 5.75115207373272e-06, + "loss": 0.0001, + "num_tokens": 2809290.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 668.125, + "completions/mean_terminated_length": 668.125, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.05792289245526656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.00535172458330635, + "learning_rate": 5.769585253456222e-06, + "loss": 0.0002, + "num_tokens": 2821251.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 202.5, + "completions/mean_terminated_length": 202.5, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.058107360265633644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0308837890625, + "kl": 0.0026180390486842953, + "learning_rate": 5.788018433179724e-06, + "loss": 0.0001, + "num_tokens": 2826263.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 589.125, + "completions/mean_terminated_length": 589.125, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.05829182807600074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.00468234814616153, + "learning_rate": 5.806451612903226e-06, + "loss": 0.0002, + "num_tokens": 2835680.0, + "reward": 1.4485294818878174, + "reward_std": 0.5965276956558228, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5735294222831726, + "rewards/fixed_code_pass_all_test_reward/std": 0.3663422167301178, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 333.875, + "completions/mean_terminated_length": 333.875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.05847629588636783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0038610458141192794, + "learning_rate": 5.824884792626728e-06, + "loss": 0.0002, + "num_tokens": 2842879.0, + "reward": 1.0592105388641357, + "reward_std": 0.46092891693115234, + "rewards/fixed_code_pass_all_test_reward/mean": 0.18421052396297455, + "rewards/fixed_code_pass_all_test_reward/std": 0.18661163747310638, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 147.875, + "completions/mean_terminated_length": 147.875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.05866076369673492, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "kl": 0.00386939563031774, + "learning_rate": 5.843317972350231e-06, + "loss": 0.0002, + "num_tokens": 2847086.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 227.125, + "completions/mean_terminated_length": 227.125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.05884523150710201, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.003912389889592305, + "learning_rate": 5.8617511520737336e-06, + "loss": 0.0002, + "num_tokens": 2853679.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.059029699317469105, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.0047894025628920645, + "learning_rate": 5.880184331797235e-06, + "loss": 0.0002, + "num_tokens": 2861231.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 769.625, + "completions/mean_terminated_length": 587.0, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.05921416712783619, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.416015625, + "kl": 0.001476364897825988, + "learning_rate": 5.8986175115207375e-06, + "loss": 0.0001, + "num_tokens": 2876108.0, + "reward": 1.4777777194976807, + "reward_std": 0.6443897485733032, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6027777791023254, + "rewards/fixed_code_pass_all_test_reward/std": 0.343534380197525, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 387.625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.059398634938203285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.004371067494503222, + "learning_rate": 5.917050691244241e-06, + "loss": 0.0002, + "num_tokens": 2885529.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 384.5, + "completions/mean_terminated_length": 146.85714721679688, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.05958310274857037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.003469252842478454, + "learning_rate": 5.935483870967742e-06, + "loss": 0.0001, + "num_tokens": 2891517.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.059767570558937465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.004643242893507704, + "learning_rate": 5.953917050691245e-06, + "loss": 0.0002, + "num_tokens": 2901291.0, + "reward": 1.1428570747375488, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, + "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 528.75, + "completions/mean_terminated_length": 528.75, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.05995203836930456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.002668224580702372, + "learning_rate": 5.972350230414747e-06, + "loss": 0.0001, + "num_tokens": 2914641.0, + "reward": 1.0833333730697632, + "reward_std": 0.2357023060321808, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022911310196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.060136506179671645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.008318491956742946, + "learning_rate": 5.9907834101382485e-06, + "loss": 0.0003, + "num_tokens": 2918789.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.06032097399003874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0056224293512059376, + "learning_rate": 6.009216589861752e-06, + "loss": 0.0002, + "num_tokens": 2925697.0, + "reward": 1.3863636255264282, + "reward_std": 0.34274399280548096, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3863636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.34274399280548096, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 252.125, + "completions/mean_terminated_length": 252.125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.06050544180040583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.0036366573767736554, + "learning_rate": 6.027649769585254e-06, + "loss": 0.0001, + "num_tokens": 2933858.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.06068990961077292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.005450486583868042, + "learning_rate": 6.046082949308756e-06, + "loss": 0.0002, + "num_tokens": 2944551.0, + "reward": 1.3977272510528564, + "reward_std": 0.5001475811004639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39772725105285645, + "rewards/fixed_code_pass_all_test_reward/std": 0.5001475811004639, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 285.375, + "completions/mean_terminated_length": 285.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.06087437742114001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.004170244283159263, + "learning_rate": 6.064516129032259e-06, + "loss": 0.0002, + "num_tokens": 2954682.0, + "reward": 1.813636302947998, + "reward_std": 0.12206411361694336, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8136363625526428, + "rewards/fixed_code_pass_all_test_reward/std": 0.12206411361694336, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 270.875, + "completions/mean_terminated_length": 270.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.0610588452315071, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.008031944889808074, + "learning_rate": 6.082949308755761e-06, + "loss": 0.0003, + "num_tokens": 2960441.0, + "reward": 1.6750000715255737, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535534143447876, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.06124331304187419, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.007419643545290455, + "learning_rate": 6.101382488479263e-06, + "loss": 0.0003, + "num_tokens": 2970032.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 267.875, + "completions/mean_terminated_length": 267.875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.061427780852241286, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.005558678589295596, + "learning_rate": 6.119815668202765e-06, + "loss": 0.0002, + "num_tokens": 2976407.0, + "reward": 1.1590909957885742, + "reward_std": 0.32867667078971863, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15909090638160706, + "rewards/fixed_code_pass_all_test_reward/std": 0.32867664098739624, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1328.0, + "completions/max_terminated_length": 1328.0, + "completions/mean_length": 1174.75, + "completions/mean_terminated_length": 1174.75, + "completions/min_length": 985.0, + "completions/min_terminated_length": 985.0, + "epoch": 0.06161224866260837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.453125, + "kl": 0.000754260050598532, + "learning_rate": 6.1382488479262684e-06, + "loss": 0.0, + "num_tokens": 3003333.0, + "reward": 1.6739130020141602, + "reward_std": 0.3109314739704132, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6739130020141602, + "rewards/fixed_code_pass_all_test_reward/std": 0.3109314441680908, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.061796716472975466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.00738844892475754, + "learning_rate": 6.15668202764977e-06, + "loss": 0.0003, + "num_tokens": 3010109.0, + "reward": 0.7749999761581421, + "reward_std": 0.48080289363861084, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, + "rewards/fixed_code_pass_all_test_reward/std": 0.05099019780755043, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 503.0, + "completions/mean_terminated_length": 503.0, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.06198118428334256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.003005138802109286, + "learning_rate": 6.175115207373272e-06, + "loss": 0.0001, + "num_tokens": 3024045.0, + "reward": 1.3011362552642822, + "reward_std": 0.2535898983478546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3011363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.253589928150177, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 120.0, + "completions/mean_terminated_length": 120.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.062165652093709646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.010897418716922402, + "learning_rate": 6.193548387096775e-06, + "loss": 0.0004, + "num_tokens": 3027725.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 290.625, + "completions/mean_terminated_length": 290.625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06235011990407674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0038318773731589317, + "learning_rate": 6.211981566820276e-06, + "loss": 0.0002, + "num_tokens": 3037226.0, + "reward": 1.0, + "reward_std": 0.623354971408844, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.21380899846553802, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.06253458771444383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.005482116554048844, + "learning_rate": 6.2304147465437795e-06, + "loss": 0.0002, + "num_tokens": 3046671.0, + "reward": 1.6324257850646973, + "reward_std": 0.5073834657669067, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6324257850646973, + "rewards/fixed_code_pass_all_test_reward/std": 0.5073834657669067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 386.75, + "completions/mean_terminated_length": 386.75, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.06271905552481093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.004501984847593121, + "learning_rate": 6.248847926267282e-06, + "loss": 0.0002, + "num_tokens": 3054717.0, + "reward": 1.3203125, + "reward_std": 0.37565046548843384, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4453125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2227003127336502, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 290.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.06290352333517801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.003907048871042207, + "learning_rate": 6.267281105990783e-06, + "loss": 0.0002, + "num_tokens": 3066745.0, + "reward": 1.6853448152542114, + "reward_std": 0.28281721472740173, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6853448152542114, + "rewards/fixed_code_pass_all_test_reward/std": 0.28281718492507935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 547.875, + "completions/mean_terminated_length": 547.875, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.0630879911455451, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.005283918377244845, + "learning_rate": 6.285714285714286e-06, + "loss": 0.0002, + "num_tokens": 3081608.0, + "reward": 0.7791666984558105, + "reward_std": 0.4830254018306732, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.04859127476811409, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 191.875, + "completions/mean_terminated_length": 191.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.06327245895591219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.00352828815812245, + "learning_rate": 6.304147465437789e-06, + "loss": 0.0001, + "num_tokens": 3086447.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 543.25, + "completions/mean_terminated_length": 543.25, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.06345692676627929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0223388671875, + "kl": 0.0029360177359194495, + "learning_rate": 6.3225806451612906e-06, + "loss": 0.0001, + "num_tokens": 3096713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 308.625, + "completions/mean_terminated_length": 308.625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.06364139457664637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.004165337551967241, + "learning_rate": 6.341013824884793e-06, + "loss": 0.0002, + "num_tokens": 3103606.0, + "reward": 1.5833332538604736, + "reward_std": 0.4671414792537689, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.31405800580978394, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 245.25, + "completions/mean_terminated_length": 245.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.06382586238701346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.004217600362608209, + "learning_rate": 6.359447004608295e-06, + "loss": 0.0002, + "num_tokens": 3111296.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 284.25, + "completions/mean_terminated_length": 284.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.06401033019738056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.00600789362215437, + "learning_rate": 6.377880184331797e-06, + "loss": 0.0002, + "num_tokens": 3119138.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 430.25, + "completions/mean_terminated_length": 430.25, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.06419479800774765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.003173600045556668, + "learning_rate": 6.3963133640553e-06, + "loss": 0.0001, + "num_tokens": 3128404.0, + "reward": 1.34375, + "reward_std": 0.4614343047142029, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4416610598564148, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 826.875, + "completions/mean_terminated_length": 826.875, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "epoch": 0.06437926581811473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.004105518906726502, + "learning_rate": 6.4147465437788025e-06, + "loss": 0.0002, + "num_tokens": 3144963.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 395.625, + "completions/mean_terminated_length": 395.625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.06456373362848183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.006416238553356379, + "learning_rate": 6.433179723502304e-06, + "loss": 0.0003, + "num_tokens": 3153064.0, + "reward": 1.0972223281860352, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0972222238779068, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.06474820143884892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.004348761518485844, + "learning_rate": 6.451612903225806e-06, + "loss": 0.0002, + "num_tokens": 3157559.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 470.375, + "completions/mean_terminated_length": 245.00001525878906, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06493266924921601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.006047164803021587, + "learning_rate": 6.47004608294931e-06, + "loss": 0.0002, + "num_tokens": 3166946.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 200.25, + "completions/mean_terminated_length": 200.25, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.06511713705958311, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.020688183431047946, + "learning_rate": 6.488479262672812e-06, + "loss": 0.0008, + "num_tokens": 3172228.0, + "reward": 1.6136363744735718, + "reward_std": 0.33313649892807007, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.33313652873039246, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.0653016048699502, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0035269658983452246, + "learning_rate": 6.5069124423963135e-06, + "loss": 0.0001, + "num_tokens": 3177238.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 196.75, + "completions/mean_terminated_length": 196.75, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.06548607268031728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.005910440318984911, + "learning_rate": 6.525345622119817e-06, + "loss": 0.0002, + "num_tokens": 3181868.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 359.25, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.06567054049068438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.00930082905688323, + "learning_rate": 6.543778801843319e-06, + "loss": 0.0004, + "num_tokens": 3192702.0, + "reward": 1.4500000476837158, + "reward_std": 0.47207745909690857, + "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.47207748889923096, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 321.875, + "completions/mean_terminated_length": 321.875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.06585500830105147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.00542910625517834, + "learning_rate": 6.562211981566821e-06, + "loss": 0.0002, + "num_tokens": 3201861.0, + "reward": 1.392045497894287, + "reward_std": 0.5055404305458069, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39204543828964233, + "rewards/fixed_code_pass_all_test_reward/std": 0.5055404305458069, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 417.25, + "completions/mean_terminated_length": 417.25, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.06603947611141855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.006025874783517793, + "learning_rate": 6.580645161290323e-06, + "loss": 0.0002, + "num_tokens": 3210375.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 388.875, + "completions/mean_terminated_length": 388.875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.06622394392178566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.0059027707611676306, + "learning_rate": 6.599078341013826e-06, + "loss": 0.0002, + "num_tokens": 3220742.0, + "reward": 1.7857142686843872, + "reward_std": 0.5060566067695618, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.16121803224086761, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 218.75, + "completions/mean_terminated_length": 218.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.06640841173215274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.00841543628484942, + "learning_rate": 6.617511520737328e-06, + "loss": 0.0003, + "num_tokens": 3228940.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 556.0, + "completions/mean_terminated_length": 556.0, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.06659287954251983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.003931356506654993, + "learning_rate": 6.63594470046083e-06, + "loss": 0.0002, + "num_tokens": 3245828.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 224.0, + "completions/mean_terminated_length": 224.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.06677734735288691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.011652858345769346, + "learning_rate": 6.6543778801843326e-06, + "loss": 0.0005, + "num_tokens": 3250540.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 267.375, + "completions/mean_terminated_length": 267.375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.06696181516325402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.005880098877241835, + "learning_rate": 6.672811059907834e-06, + "loss": 0.0002, + "num_tokens": 3258895.0, + "reward": 1.904761791229248, + "reward_std": 0.154827281832695, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.1548272967338562, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 474.5, + "completions/mean_terminated_length": 474.5, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.0671462829736211, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0142822265625, + "kl": 0.0012356218394415919, + "learning_rate": 6.691244239631337e-06, + "loss": 0.0, + "num_tokens": 3267659.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 217.875, + "completions/mean_terminated_length": 217.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.06733075078398819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.008868088538292795, + "learning_rate": 6.70967741935484e-06, + "loss": 0.0004, + "num_tokens": 3276162.0, + "reward": 1.3900861740112305, + "reward_std": 0.7316022515296936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6400862336158752, + "rewards/fixed_code_pass_all_test_reward/std": 0.42055612802505493, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 281.75, + "completions/mean_terminated_length": 281.75, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.06751521859435529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.008630494150565937, + "learning_rate": 6.728110599078341e-06, + "loss": 0.0003, + "num_tokens": 3282464.0, + "reward": 1.1541666984558105, + "reward_std": 0.34592297673225403, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15416666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.34592294692993164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 233.875, + "completions/mean_terminated_length": 233.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.06769968640472238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.0059770430088974535, + "learning_rate": 6.746543778801844e-06, + "loss": 0.0002, + "num_tokens": 3290935.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 388.0, + "completions/mean_terminated_length": 388.0, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.06788415421508946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.005427137948572636, + "learning_rate": 6.764976958525347e-06, + "loss": 0.0002, + "num_tokens": 3298911.0, + "reward": 1.5625, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 443.125, + "completions/mean_terminated_length": 443.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.06806862202545656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.004619544371962547, + "learning_rate": 6.783410138248848e-06, + "loss": 0.0002, + "num_tokens": 3312248.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.06825308983582365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.00374437149730511, + "learning_rate": 6.801843317972351e-06, + "loss": 0.0001, + "num_tokens": 3316453.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 109.875, + "completions/mean_terminated_length": 109.875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.06843755764619074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.00852827716153115, + "learning_rate": 6.820276497695853e-06, + "loss": 0.0003, + "num_tokens": 3320212.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 298.625, + "completions/mean_terminated_length": 298.625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.06862202545655784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.006679936865111813, + "learning_rate": 6.838709677419355e-06, + "loss": 0.0003, + "num_tokens": 3326369.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 173.875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.06880649326692492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.003922005329513922, + "learning_rate": 6.857142857142858e-06, + "loss": 0.0002, + "num_tokens": 3330752.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 354.625, + "completions/mean_terminated_length": 354.625, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.06899096107729201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.01255025013233535, + "learning_rate": 6.87557603686636e-06, + "loss": 0.0005, + "num_tokens": 3340221.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.06917542888765911, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0056608308223076165, + "learning_rate": 6.894009216589862e-06, + "loss": 0.0002, + "num_tokens": 3345115.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 645.0, + "completions/mean_terminated_length": 444.5714416503906, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.0693598966980262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.004642765736207366, + "learning_rate": 6.912442396313365e-06, + "loss": 0.0002, + "num_tokens": 3356483.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 523.625, + "completions/mean_terminated_length": 523.625, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.06954436450839328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.003307518913061358, + "learning_rate": 6.9308755760368674e-06, + "loss": 0.0001, + "num_tokens": 3369144.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 434.5, + "completions/mean_terminated_length": 434.5, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.06972883231876037, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0341796875, + "kl": 0.00414440970052965, + "learning_rate": 6.949308755760369e-06, + "loss": 0.0002, + "num_tokens": 3377484.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 358.875, + "completions/mean_terminated_length": 358.875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.06991330012912747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.006078801321564242, + "learning_rate": 6.967741935483871e-06, + "loss": 0.0002, + "num_tokens": 3384643.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 106.75, + "completions/mean_terminated_length": 106.75, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.07009776793949456, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.796875, + "kl": 0.007344163459492847, + "learning_rate": 6.986175115207375e-06, + "loss": 0.0003, + "num_tokens": 3388313.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 434.25, + "completions/mean_terminated_length": 434.25, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.07028223574986164, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034912109375, + "kl": 0.004899154737358913, + "learning_rate": 7.004608294930876e-06, + "loss": 0.0002, + "num_tokens": 3396683.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 492.25, + "completions/mean_terminated_length": 492.25, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.07046670356022874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.011205047485418618, + "learning_rate": 7.0230414746543785e-06, + "loss": 0.0004, + "num_tokens": 3410477.0, + "reward": 1.2999999523162842, + "reward_std": 0.440778523683548, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.44077855348587036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 320.5, + "completions/mean_terminated_length": 320.5, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.07065117137059583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.008305327268317342, + "learning_rate": 7.041474654377881e-06, + "loss": 0.0003, + "num_tokens": 3419649.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 560.375, + "completions/mean_terminated_length": 347.8571472167969, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.07083563918096292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5390625, + "kl": 0.005096556022181176, + "learning_rate": 7.059907834101382e-06, + "loss": 0.0002, + "num_tokens": 3431308.0, + "reward": 1.125, + "reward_std": 0.4576992690563202, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.11428051441907883, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.07102010699133002, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.1875, + "kl": 0.01666510189534165, + "learning_rate": 7.078341013824886e-06, + "loss": 0.0007, + "num_tokens": 3435237.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 468.75, + "completions/mean_terminated_length": 468.75, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.0712045748016971, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.004176500806352124, + "learning_rate": 7.096774193548388e-06, + "loss": 0.0002, + "num_tokens": 3449651.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 408.375, + "completions/mean_terminated_length": 408.375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.07138904261206419, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.009258315694751218, + "learning_rate": 7.1152073732718896e-06, + "loss": 0.0004, + "num_tokens": 3458262.0, + "reward": 1.125, + "reward_std": 0.05563493072986603, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.055634867399930954, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.07157351042243129, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.006676415679976344, + "learning_rate": 7.133640552995392e-06, + "loss": 0.0003, + "num_tokens": 3467653.0, + "reward": 1.5125000476837158, + "reward_std": 0.3554283678531647, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5125000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.35542842745780945, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 564.625, + "completions/mean_terminated_length": 564.625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.07175797823279838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.0036779690417461097, + "learning_rate": 7.152073732718895e-06, + "loss": 0.0001, + "num_tokens": 3482618.0, + "reward": 1.0125000476837158, + "reward_std": 0.5350974798202515, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13750000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.34934747219085693, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1457.0, + "completions/max_terminated_length": 1457.0, + "completions/mean_length": 368.5, + "completions/mean_terminated_length": 368.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.07194244604316546, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.009317669726442546, + "learning_rate": 7.170506912442397e-06, + "loss": 0.0004, + "num_tokens": 3491942.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.07212691385353256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.010448655200889334, + "learning_rate": 7.188940092165899e-06, + "loss": 0.0004, + "num_tokens": 3501784.0, + "reward": 1.2984693050384521, + "reward_std": 0.6190042495727539, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4234693646430969, + "rewards/fixed_code_pass_all_test_reward/std": 0.37037143111228943, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 444.125, + "completions/mean_terminated_length": 444.125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.07231138166389965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.008311654266435653, + "learning_rate": 7.2073732718894015e-06, + "loss": 0.0003, + "num_tokens": 3510585.0, + "reward": 1.019230842590332, + "reward_std": 0.6011400818824768, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, + "rewards/fixed_code_pass_all_test_reward/std": 0.17804233729839325, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 370.875, + "completions/mean_terminated_length": 370.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.07249584947426674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.0074505830416455865, + "learning_rate": 7.225806451612903e-06, + "loss": 0.0003, + "num_tokens": 3521328.0, + "reward": 1.0879629850387573, + "reward_std": 0.04399021714925766, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08796296268701553, + "rewards/fixed_code_pass_all_test_reward/std": 0.043990183621644974, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 349.625, + "completions/mean_terminated_length": 349.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.07268031728463382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.0050389283715048805, + "learning_rate": 7.244239631336406e-06, + "loss": 0.0002, + "num_tokens": 3531725.0, + "reward": 1.3095238208770752, + "reward_std": 0.4348659813404083, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3095237910747528, + "rewards/fixed_code_pass_all_test_reward/std": 0.4348660111427307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 202.0, + "completions/mean_terminated_length": 202.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.07286478509500093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.0091605992638506, + "learning_rate": 7.262672811059909e-06, + "loss": 0.0004, + "num_tokens": 3538525.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 276.125, + "completions/mean_terminated_length": 276.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.07304925290536801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.00791086972458288, + "learning_rate": 7.28110599078341e-06, + "loss": 0.0003, + "num_tokens": 3546446.0, + "reward": 1.7333333492279053, + "reward_std": 0.28507864475250244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7333333492279053, + "rewards/fixed_code_pass_all_test_reward/std": 0.28507867455482483, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 697.25, + "completions/mean_terminated_length": 247.0, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.0732337207157351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.005798906306154095, + "learning_rate": 7.299539170506913e-06, + "loss": 0.0002, + "num_tokens": 3558944.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.0734181885261022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.007382256560958922, + "learning_rate": 7.317972350230416e-06, + "loss": 0.0003, + "num_tokens": 3564235.0, + "reward": 1.236918568611145, + "reward_std": 0.3379781246185303, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2369185984134674, + "rewards/fixed_code_pass_all_test_reward/std": 0.33797815442085266, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 469.0, + "completions/mean_terminated_length": 469.0, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.07360265633646929, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.0075411254074424505, + "learning_rate": 7.336405529953917e-06, + "loss": 0.0003, + "num_tokens": 3575163.0, + "reward": 1.3181818723678589, + "reward_std": 0.5454545617103577, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4431818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.31280240416526794, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 201.625, + "completions/mean_terminated_length": 201.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.07378712414683637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.00868042593356222, + "learning_rate": 7.35483870967742e-06, + "loss": 0.0003, + "num_tokens": 3580920.0, + "reward": 1.3571429252624512, + "reward_std": 0.41121309995651245, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.41121309995651245, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.07397159195720347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.006499154143966734, + "learning_rate": 7.373271889400923e-06, + "loss": 0.0003, + "num_tokens": 3587620.0, + "reward": 1.329545497894287, + "reward_std": 0.25684255361557007, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3295454680919647, + "rewards/fixed_code_pass_all_test_reward/std": 0.25684261322021484, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 316.25, + "completions/mean_terminated_length": 316.25, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.07415605976757056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.007306609419174492, + "learning_rate": 7.3917050691244244e-06, + "loss": 0.0003, + "num_tokens": 3594606.0, + "reward": 1.1712963581085205, + "reward_std": 0.1802925169467926, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17129631340503693, + "rewards/fixed_code_pass_all_test_reward/std": 0.1802925169467926, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.07434052757793765, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.0058782399573829025, + "learning_rate": 7.410138248847927e-06, + "loss": 0.0002, + "num_tokens": 3599061.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 525.375, + "completions/mean_terminated_length": 525.375, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.07452499538830475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.0017721121330396272, + "learning_rate": 7.428571428571429e-06, + "loss": 0.0001, + "num_tokens": 3607536.0, + "reward": 1.7083333730697632, + "reward_std": 0.41547447443008423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.07470946319867183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.003414766521018464, + "learning_rate": 7.447004608294931e-06, + "loss": 0.0001, + "num_tokens": 3613674.0, + "reward": 1.890625, + "reward_std": 0.04419417306780815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.890625, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 125.875, + "completions/mean_terminated_length": 125.875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.07489393100903892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.007081861549522728, + "learning_rate": 7.465437788018434e-06, + "loss": 0.0003, + "num_tokens": 3617417.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 454.75, + "completions/mean_terminated_length": 454.75, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.07507839881940602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.004671683054766618, + "learning_rate": 7.483870967741936e-06, + "loss": 0.0002, + "num_tokens": 3625655.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1039.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 567.625, + "completions/mean_terminated_length": 567.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.0752628666297731, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.003710546501679346, + "learning_rate": 7.502304147465438e-06, + "loss": 0.0001, + "num_tokens": 3637236.0, + "reward": 1.0208333730697632, + "reward_std": 0.024622410535812378, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, + "rewards/fixed_code_pass_all_test_reward/std": 0.024622369557619095, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 267.375, + "completions/mean_terminated_length": 267.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.07544733444014019, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.011776236293371767, + "learning_rate": 7.52073732718894e-06, + "loss": 0.0005, + "num_tokens": 3643655.0, + "reward": 1.3421052694320679, + "reward_std": 0.47908368706703186, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34210526943206787, + "rewards/fixed_code_pass_all_test_reward/std": 0.47908368706703186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.07563180225050728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.010540796763962135, + "learning_rate": 7.5391705069124435e-06, + "loss": 0.0004, + "num_tokens": 3651035.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.07581627006087438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.008632576616946608, + "learning_rate": 7.557603686635945e-06, + "loss": 0.0003, + "num_tokens": 3661590.0, + "reward": 1.1071429252624512, + "reward_std": 0.16642357409000397, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428582072258, + "rewards/fixed_code_pass_all_test_reward/std": 0.16642354428768158, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 445.5, + "completions/mean_terminated_length": 445.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.07600073787124147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.008751891989959404, + "learning_rate": 7.576036866359447e-06, + "loss": 0.0004, + "num_tokens": 3673506.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.07618520568160855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.002681053098058328, + "learning_rate": 7.59447004608295e-06, + "loss": 0.0001, + "num_tokens": 3679602.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 310.625, + "completions/mean_terminated_length": 310.625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.07636967349197565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.007404296571621671, + "learning_rate": 7.612903225806451e-06, + "loss": 0.0003, + "num_tokens": 3688879.0, + "reward": 1.6607143878936768, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535534143447876, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.07655414130234274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.017222604888956994, + "learning_rate": 7.631336405529954e-06, + "loss": 0.0007, + "num_tokens": 3695700.0, + "reward": 0.9510869979858398, + "reward_std": 0.672534167766571, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20108693838119507, + "rewards/fixed_code_pass_all_test_reward/std": 0.3508698642253876, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 161.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.07673860911270983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.00511009362526238, + "learning_rate": 7.649769585253457e-06, + "loss": 0.0002, + "num_tokens": 3699918.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 356.125, + "completions/mean_terminated_length": 356.125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.07692307692307693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.010440759273478761, + "learning_rate": 7.66820276497696e-06, + "loss": 0.0004, + "num_tokens": 3709959.0, + "reward": 1.3510100841522217, + "reward_std": 0.7012073993682861, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4760100841522217, + "rewards/fixed_code_pass_all_test_reward/std": 0.4803008735179901, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.07710754473344401, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.014838128234259784, + "learning_rate": 7.686635944700462e-06, + "loss": 0.0006, + "num_tokens": 3718239.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 749.875, + "completions/mean_terminated_length": 749.875, + "completions/min_length": 620.0, + "completions/min_terminated_length": 620.0, + "epoch": 0.0772920125438111, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.004735645023174584, + "learning_rate": 7.705069124423963e-06, + "loss": 0.0002, + "num_tokens": 3735702.0, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 209.25, + "completions/mean_terminated_length": 209.25, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.0774764803541782, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.007138397195376456, + "learning_rate": 7.723502304147466e-06, + "loss": 0.0003, + "num_tokens": 3744312.0, + "reward": 1.1994680166244507, + "reward_std": 0.35327890515327454, + "rewards/fixed_code_pass_all_test_reward/mean": 0.19946807622909546, + "rewards/fixed_code_pass_all_test_reward/std": 0.35327890515327454, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 392.625, + "completions/mean_terminated_length": 392.625, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.07766094816454529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.0062046901730354875, + "learning_rate": 7.741935483870968e-06, + "loss": 0.0002, + "num_tokens": 3752405.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 201.75, + "completions/mean_terminated_length": 201.75, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.07784541597491237, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.005573209258727729, + "learning_rate": 7.760368663594471e-06, + "loss": 0.0002, + "num_tokens": 3757043.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 445.125, + "completions/mean_terminated_length": 445.125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.07802988378527947, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.005312072375090793, + "learning_rate": 7.778801843317973e-06, + "loss": 0.0002, + "num_tokens": 3770788.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 411.75, + "completions/mean_terminated_length": 411.75, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.07821435159564656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.004795217857463285, + "learning_rate": 7.797235023041474e-06, + "loss": 0.0002, + "num_tokens": 3780842.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.07839881940601365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.009189833799609914, + "learning_rate": 7.815668202764978e-06, + "loss": 0.0004, + "num_tokens": 3785235.0, + "reward": 0.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 191.375, + "completions/mean_terminated_length": 191.375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.07858328721638075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0274658203125, + "kl": 0.003783193475101143, + "learning_rate": 7.83410138248848e-06, + "loss": 0.0002, + "num_tokens": 3789846.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 299.125, + "completions/mean_terminated_length": 299.125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.07876775502674783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.006670542468782514, + "learning_rate": 7.852534562211982e-06, + "loss": 0.0003, + "num_tokens": 3799167.0, + "reward": 1.18478262424469, + "reward_std": 0.5859780311584473, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30978259444236755, + "rewards/fixed_code_pass_all_test_reward/std": 0.3603622019290924, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.07895222283711492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.010415720200398937, + "learning_rate": 7.870967741935484e-06, + "loss": 0.0004, + "num_tokens": 3805229.0, + "reward": 1.0719339847564697, + "reward_std": 0.2034599632024765, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07193396240472794, + "rewards/fixed_code_pass_all_test_reward/std": 0.2034599632024765, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 88.0, + "completions/mean_terminated_length": 88.0, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.07913669064748201, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.007869254244724289, + "learning_rate": 7.889400921658987e-06, + "loss": 0.0003, + "num_tokens": 3808717.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 168.0, + "completions/mean_terminated_length": 168.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.07932115845784911, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.00551656776224263, + "learning_rate": 7.907834101382489e-06, + "loss": 0.0002, + "num_tokens": 3812997.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 385.875, + "completions/mean_terminated_length": 148.42857360839844, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.0795056262682162, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.15625, + "kl": 0.010437879238452297, + "learning_rate": 7.926267281105992e-06, + "loss": 0.0004, + "num_tokens": 3818836.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 319.75, + "completions/mean_terminated_length": 319.75, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.07969009407858328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0076426854357123375, + "learning_rate": 7.944700460829495e-06, + "loss": 0.0003, + "num_tokens": 3827026.0, + "reward": 1.4898648262023926, + "reward_std": 0.2673344314098358, + "rewards/fixed_code_pass_all_test_reward/mean": 0.48986485600471497, + "rewards/fixed_code_pass_all_test_reward/std": 0.2673344612121582, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 659.125, + "completions/mean_terminated_length": 659.125, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.07987456188895038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.003179049410391599, + "learning_rate": 7.963133640552997e-06, + "loss": 0.0001, + "num_tokens": 3839435.0, + "reward": 1.1583333015441895, + "reward_std": 0.345377653837204, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15833333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.345377653837204, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 337.375, + "completions/mean_terminated_length": 337.375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.08005902969931747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.008061024942435324, + "learning_rate": 7.981566820276498e-06, + "loss": 0.0003, + "num_tokens": 3849790.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 366.25, + "completions/mean_terminated_length": 366.25, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.08024349750968456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.006641245912760496, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0003, + "num_tokens": 3857536.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 250.25, + "completions/mean_terminated_length": 250.25, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.08042796532005166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.01739987311884761, + "learning_rate": 8.018433179723503e-06, + "loss": 0.0007, + "num_tokens": 3865474.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1413.0, + "completions/max_terminated_length": 1413.0, + "completions/mean_length": 496.25, + "completions/mean_terminated_length": 496.25, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.08061243313041874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0060167695992277, + "learning_rate": 8.036866359447006e-06, + "loss": 0.0002, + "num_tokens": 3877508.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.08079690094078583, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.007288697815965861, + "learning_rate": 8.055299539170508e-06, + "loss": 0.0003, + "num_tokens": 3882022.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 463.125, + "completions/mean_terminated_length": 463.125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.08098136875115293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.005428747681435198, + "learning_rate": 8.07373271889401e-06, + "loss": 0.0002, + "num_tokens": 3891407.0, + "reward": 1.2291666269302368, + "reward_std": 0.426665723323822, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.4266657531261444, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.08116583656152002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.007323861646000296, + "learning_rate": 8.092165898617512e-06, + "loss": 0.0003, + "num_tokens": 3897366.0, + "reward": 1.6477272510528564, + "reward_std": 0.48699134588241577, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.48699134588241577, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 438.875, + "completions/mean_terminated_length": 209.00001525878906, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.0813503043718871, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.007362302916590124, + "learning_rate": 8.110599078341016e-06, + "loss": 0.0003, + "num_tokens": 3907069.0, + "reward": 1.128787875175476, + "reward_std": 0.6401147246360779, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2537878751754761, + "rewards/fixed_code_pass_all_test_reward/std": 0.46069079637527466, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 306.625, + "completions/mean_terminated_length": 306.625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0815347721822542, + "frac_reward_zero_std": 0.0, + "grad_norm": 43.75, + "kl": 0.08235446570324712, + "learning_rate": 8.129032258064517e-06, + "loss": 0.0033, + "num_tokens": 3917058.0, + "reward": 1.6062500476837158, + "reward_std": 0.6538007259368896, + "rewards/fixed_code_pass_all_test_reward/mean": 0.731249988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.3058215081691742, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 133.625, + "completions/mean_terminated_length": 133.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.08171923999262129, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.006973760697292164, + "learning_rate": 8.147465437788019e-06, + "loss": 0.0003, + "num_tokens": 3920927.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 350.375, + "completions/mean_terminated_length": 350.375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.08190370780298838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.012409621442202479, + "learning_rate": 8.165898617511522e-06, + "loss": 0.0005, + "num_tokens": 3928426.0, + "reward": 1.4331395626068115, + "reward_std": 0.5681463479995728, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6831395626068115, + "rewards/fixed_code_pass_all_test_reward/std": 0.19255134463310242, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.08208817561335546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.007150022676796652, + "learning_rate": 8.184331797235023e-06, + "loss": 0.0003, + "num_tokens": 3937975.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 1146.0, + "completions/mean_terminated_length": 1146.0, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "epoch": 0.08227264342372256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5625, + "kl": 0.0022964466115809046, + "learning_rate": 8.202764976958527e-06, + "loss": 0.0001, + "num_tokens": 3960519.0, + "reward": 0.9895833730697632, + "reward_std": 0.5126137733459473, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1145833358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.08245711123408965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.007960955641465262, + "learning_rate": 8.221198156682028e-06, + "loss": 0.0003, + "num_tokens": 3969577.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 318.625, + "completions/mean_terminated_length": 318.625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.08264157904445674, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.009985378885176033, + "learning_rate": 8.23963133640553e-06, + "loss": 0.0004, + "num_tokens": 3980686.0, + "reward": 1.6687500476837158, + "reward_std": 0.7116066217422485, + "rewards/fixed_code_pass_all_test_reward/mean": 0.793749988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.39318978786468506, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 756.125, + "completions/mean_terminated_length": 571.5714721679688, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.08282604685482384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.0033089515927713364, + "learning_rate": 8.258064516129033e-06, + "loss": 0.0001, + "num_tokens": 3992559.0, + "reward": 1.30978262424469, + "reward_std": 0.6864442825317383, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43478259444236755, + "rewards/fixed_code_pass_all_test_reward/std": 0.47114986181259155, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.08301051466519092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.004083815962076187, + "learning_rate": 8.276497695852536e-06, + "loss": 0.0002, + "num_tokens": 3998052.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.08319498247555801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.009311292902566493, + "learning_rate": 8.294930875576038e-06, + "loss": 0.0004, + "num_tokens": 4008925.0, + "reward": 1.2407407760620117, + "reward_std": 0.19598153233528137, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24074074625968933, + "rewards/fixed_code_pass_all_test_reward/std": 0.19598159193992615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 354.375, + "completions/mean_terminated_length": 354.375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.08337945028592511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.007063408003887162, + "learning_rate": 8.31336405529954e-06, + "loss": 0.0003, + "num_tokens": 4016800.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 301.125, + "completions/mean_terminated_length": 301.125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.0835639180962922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03515625, + "kl": 0.003827101696515456, + "learning_rate": 8.331797235023043e-06, + "loss": 0.0002, + "num_tokens": 4023041.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 166.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.08374838590665928, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.01117328746477142, + "learning_rate": 8.350230414746544e-06, + "loss": 0.0004, + "num_tokens": 4027377.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 229.625, + "completions/mean_terminated_length": 229.625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.08393285371702638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.012280484283110127, + "learning_rate": 8.368663594470047e-06, + "loss": 0.0005, + "num_tokens": 4034382.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 151.125, + "completions/mean_terminated_length": 151.125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.08411732152739347, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.007756367325782776, + "learning_rate": 8.387096774193549e-06, + "loss": 0.0003, + "num_tokens": 4038463.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.08430178933776056, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.0048126247129403055, + "learning_rate": 8.40552995391705e-06, + "loss": 0.0002, + "num_tokens": 4043056.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.08448625714812766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.009729965298902243, + "learning_rate": 8.423963133640554e-06, + "loss": 0.0004, + "num_tokens": 4051839.0, + "reward": 1.394230842590332, + "reward_std": 0.24476775527000427, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39423078298568726, + "rewards/fixed_code_pass_all_test_reward/std": 0.24476775527000427, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 318.125, + "completions/mean_terminated_length": 318.125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.08467072495849474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.01290415384573862, + "learning_rate": 8.442396313364057e-06, + "loss": 0.0005, + "num_tokens": 4061352.0, + "reward": 1.4895833730697632, + "reward_std": 0.29693371057510376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4895833432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 365.5, + "completions/mean_terminated_length": 365.5, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.08485519276886183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.023275704821571708, + "learning_rate": 8.460829493087558e-06, + "loss": 0.0009, + "num_tokens": 4069012.0, + "reward": 0.9874999523162842, + "reward_std": 0.34511902928352356, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11250000447034836, + "rewards/fixed_code_pass_all_test_reward/std": 0.06943651288747787, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 160.5, + "completions/mean_terminated_length": 160.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.08503966057922892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.007288396620424464, + "learning_rate": 8.47926267281106e-06, + "loss": 0.0003, + "num_tokens": 4073128.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.08522412838959602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.009763230045791715, + "learning_rate": 8.497695852534563e-06, + "loss": 0.0004, + "num_tokens": 4080876.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 429.125, + "completions/mean_terminated_length": 429.125, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.0854085961999631, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.004037137754494324, + "learning_rate": 8.516129032258065e-06, + "loss": 0.0002, + "num_tokens": 4089293.0, + "reward": 1.5178570747375488, + "reward_std": 0.6169180870056152, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.26816877722740173, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.08559306401033019, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.00536981294862926, + "learning_rate": 8.534562211981568e-06, + "loss": 0.0002, + "num_tokens": 4097988.0, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.08577753182069729, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.006526368291815743, + "learning_rate": 8.55299539170507e-06, + "loss": 0.0003, + "num_tokens": 4105154.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 351.75, + "completions/mean_terminated_length": 351.75, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.08596199963106438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.006330050469841808, + "learning_rate": 8.571428571428571e-06, + "loss": 0.0003, + "num_tokens": 4112616.0, + "reward": 1.125, + "reward_std": 0.23570221662521362, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022613286972, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 173.125, + "completions/mean_terminated_length": 173.125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.08614646744143147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.014002199633978307, + "learning_rate": 8.589861751152074e-06, + "loss": 0.0006, + "num_tokens": 4121177.0, + "reward": 1.1354167461395264, + "reward_std": 0.35055938363075256, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1354166716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.3505593538284302, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.08633093525179857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.014031477738171816, + "learning_rate": 8.608294930875577e-06, + "loss": 0.0006, + "num_tokens": 4127943.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 233.125, + "completions/mean_terminated_length": 233.125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.08651540306216565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.013643107493408024, + "learning_rate": 8.626728110599079e-06, + "loss": 0.0005, + "num_tokens": 4137448.0, + "reward": 1.1428570747375488, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1428571492433548, + "rewards/fixed_code_pass_all_test_reward/std": 0.3499271273612976, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 271.125, + "completions/mean_terminated_length": 271.125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.08669987087253274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.009878469543764368, + "learning_rate": 8.64516129032258e-06, + "loss": 0.0004, + "num_tokens": 4147113.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 492.25, + "completions/mean_terminated_length": 492.25, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.08688433868289984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.005292469693813473, + "learning_rate": 8.663594470046084e-06, + "loss": 0.0002, + "num_tokens": 4160651.0, + "reward": 1.9027777910232544, + "reward_std": 0.2749859392642975, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 423.875, + "completions/mean_terminated_length": 423.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.08706880649326693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.008746038933168165, + "learning_rate": 8.682027649769585e-06, + "loss": 0.0003, + "num_tokens": 4173666.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 474.125, + "completions/mean_terminated_length": 474.125, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.08725327430363401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.0050069118733517826, + "learning_rate": 8.700460829493088e-06, + "loss": 0.0002, + "num_tokens": 4184979.0, + "reward": 1.774999976158142, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 226.75, + "completions/mean_terminated_length": 226.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.08743774211400111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.01655578170903027, + "learning_rate": 8.71889400921659e-06, + "loss": 0.0007, + "num_tokens": 4195329.0, + "reward": 1.1590909957885742, + "reward_std": 0.06428244709968567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15909090638160706, + "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 254.125, + "completions/mean_terminated_length": 254.125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.0876222099243682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.012738831341266632, + "learning_rate": 8.737327188940093e-06, + "loss": 0.0005, + "num_tokens": 4203946.0, + "reward": 1.0299999713897705, + "reward_std": 0.08485280722379684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, + "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 396.5, + "completions/mean_terminated_length": 396.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.08780667773473529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.01002240157686174, + "learning_rate": 8.755760368663595e-06, + "loss": 0.0004, + "num_tokens": 4217630.0, + "reward": 1.0184426307678223, + "reward_std": 0.052163634449243546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01844262331724167, + "rewards/fixed_code_pass_all_test_reward/std": 0.052163612097501755, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.08799114554510237, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.007251915172673762, + "learning_rate": 8.774193548387098e-06, + "loss": 0.0003, + "num_tokens": 4221741.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 163.875, + "completions/mean_terminated_length": 163.875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.08817561335546947, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.0037967060925439, + "learning_rate": 8.7926267281106e-06, + "loss": 0.0002, + "num_tokens": 4225964.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 574.625, + "completions/mean_terminated_length": 364.14288330078125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.08836008116583656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.007972997380420566, + "learning_rate": 8.811059907834103e-06, + "loss": 0.0003, + "num_tokens": 4235209.0, + "reward": 1.1530611515045166, + "reward_std": 0.5718449354171753, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2780612111091614, + "rewards/fixed_code_pass_all_test_reward/std": 0.3500864803791046, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.08854454897620365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.006501436437247321, + "learning_rate": 8.829493087557604e-06, + "loss": 0.0003, + "num_tokens": 4240387.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.08872901678657075, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.016169553971849382, + "learning_rate": 8.847926267281107e-06, + "loss": 0.0006, + "num_tokens": 4246950.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 370.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.08891348459693783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.013593474985100329, + "learning_rate": 8.866359447004609e-06, + "loss": 0.0005, + "num_tokens": 4257158.0, + "reward": 1.351063847541809, + "reward_std": 0.41131263971328735, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4760638177394867, + "rewards/fixed_code_pass_all_test_reward/std": 0.32979950308799744, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.08909795240730492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.018375163315795362, + "learning_rate": 8.884792626728112e-06, + "loss": 0.0007, + "num_tokens": 4268570.0, + "reward": 1.389423131942749, + "reward_std": 0.37030377984046936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38942307233810425, + "rewards/fixed_code_pass_all_test_reward/std": 0.37030377984046936, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 290.625, + "completions/mean_terminated_length": 290.625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.08928242021767202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.011039349192287773, + "learning_rate": 8.903225806451614e-06, + "loss": 0.0004, + "num_tokens": 4275039.0, + "reward": 1.663461446762085, + "reward_std": 0.13598209619522095, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6634615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.13598206639289856, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 383.75, + "completions/mean_terminated_length": 383.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.08946688802803911, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.009812530159251764, + "learning_rate": 8.921658986175115e-06, + "loss": 0.0004, + "num_tokens": 4283397.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 478.5, + "completions/mean_terminated_length": 478.5, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.0896513558384062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.008848927856888622, + "learning_rate": 8.940092165898619e-06, + "loss": 0.0004, + "num_tokens": 4295729.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.0898358236487733, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.023507019854150712, + "learning_rate": 8.958525345622122e-06, + "loss": 0.0009, + "num_tokens": 4303727.0, + "reward": 1.7625000476837158, + "reward_std": 0.219983771443367, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7625000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.2199837565422058, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.09002029145914038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.015778631321154535, + "learning_rate": 8.976958525345623e-06, + "loss": 0.0006, + "num_tokens": 4312524.0, + "reward": 1.6931817531585693, + "reward_std": 0.42414578795433044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818723678589, + "rewards/fixed_code_pass_all_test_reward/std": 0.42414578795433044, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 653.0, + "completions/mean_terminated_length": 653.0, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.09020475926950747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.008133421768434346, + "learning_rate": 8.995391705069125e-06, + "loss": 0.0003, + "num_tokens": 4328804.0, + "reward": 0.9107142686843872, + "reward_std": 0.506157398223877, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1607142835855484, + "rewards/fixed_code_pass_all_test_reward/std": 0.17806050181388855, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 191.0, + "completions/mean_terminated_length": 191.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.09038922707987457, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.013259486702736467, + "learning_rate": 9.013824884792628e-06, + "loss": 0.0005, + "num_tokens": 4333228.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 242.375, + "completions/mean_terminated_length": 242.375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.09057369489024165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.02614689970505424, + "learning_rate": 9.03225806451613e-06, + "loss": 0.001, + "num_tokens": 4339495.0, + "reward": 1.4666666984558105, + "reward_std": 0.1885617971420288, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46666669845581055, + "rewards/fixed_code_pass_all_test_reward/std": 0.1885618269443512, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 268.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.09075816270060874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.021704821148887277, + "learning_rate": 9.050691244239633e-06, + "loss": 0.0009, + "num_tokens": 4349889.0, + "reward": 1.3125, + "reward_std": 0.2912411689758301, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.29124119877815247, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.09094263051097584, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.015625, + "kl": 0.01827052456792444, + "learning_rate": 9.069124423963134e-06, + "loss": 0.0007, + "num_tokens": 4358312.0, + "reward": 1.6454325914382935, + "reward_std": 0.40258297324180603, + "rewards/fixed_code_pass_all_test_reward/mean": 0.770432710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204118013381958, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 769.625, + "completions/mean_terminated_length": 769.625, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "epoch": 0.09112709832134293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.006645219080382958, + "learning_rate": 9.087557603686636e-06, + "loss": 0.0003, + "num_tokens": 4376293.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 403.625, + "completions/mean_terminated_length": 403.625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.09131156613171001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.020066366530954838, + "learning_rate": 9.105990783410139e-06, + "loss": 0.0008, + "num_tokens": 4387354.0, + "reward": 1.0499999523162842, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 638.875, + "completions/mean_terminated_length": 638.875, + "completions/min_length": 603.0, + "completions/min_terminated_length": 603.0, + "epoch": 0.0914960339420771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6171875, + "kl": 0.006086960493121296, + "learning_rate": 9.124423963133642e-06, + "loss": 0.0002, + "num_tokens": 4399681.0, + "reward": 1.3161765336990356, + "reward_std": 0.12453576177358627, + "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, + "rewards/fixed_code_pass_all_test_reward/std": 0.12453572452068329, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.0916805017524442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.009968580154236406, + "learning_rate": 9.142857142857144e-06, + "loss": 0.0004, + "num_tokens": 4408686.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 115.75, + "completions/mean_terminated_length": 115.75, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.09186496956281129, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.921875, + "kl": 0.017985473736189306, + "learning_rate": 9.161290322580645e-06, + "loss": 0.0007, + "num_tokens": 4412284.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.09204943737317837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.00770829024259001, + "learning_rate": 9.179723502304149e-06, + "loss": 0.0003, + "num_tokens": 4417731.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.09223390518354548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.016197526594623923, + "learning_rate": 9.19815668202765e-06, + "loss": 0.0006, + "num_tokens": 4425468.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 230.25, + "completions/mean_terminated_length": 230.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.09241837299391256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.010029495519120246, + "learning_rate": 9.216589861751153e-06, + "loss": 0.0004, + "num_tokens": 4430294.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 391.375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.09260284080427965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.00859968806616962, + "learning_rate": 9.235023041474655e-06, + "loss": 0.0003, + "num_tokens": 4438401.0, + "reward": 1.7708333730697632, + "reward_std": 0.27706217765808105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.27706217765808105, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.09278730861464675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.015643441933207214, + "learning_rate": 9.253456221198156e-06, + "loss": 0.0006, + "num_tokens": 4444701.0, + "reward": 1.4202585220336914, + "reward_std": 0.27477267384529114, + "rewards/fixed_code_pass_all_test_reward/mean": 0.42025864124298096, + "rewards/fixed_code_pass_all_test_reward/std": 0.2747727334499359, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 296.875, + "completions/mean_terminated_length": 296.875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.09297177642501384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.009262629377190024, + "learning_rate": 9.27188940092166e-06, + "loss": 0.0004, + "num_tokens": 4451732.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 233.125, + "completions/mean_terminated_length": 233.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.09315624423538092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.015469123609364033, + "learning_rate": 9.290322580645163e-06, + "loss": 0.0006, + "num_tokens": 4460181.0, + "reward": 1.043269157409668, + "reward_std": 0.059717193245887756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.04326923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.05971721187233925, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 139.75, + "completions/mean_terminated_length": 139.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.09334071204574802, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.029780214419588447, + "learning_rate": 9.308755760368664e-06, + "loss": 0.0012, + "num_tokens": 4464003.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 320.75, + "completions/mean_terminated_length": 320.75, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.09352517985611511, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034423828125, + "kl": 0.006179669551784173, + "learning_rate": 9.327188940092166e-06, + "loss": 0.0002, + "num_tokens": 4469817.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 283.5, + "completions/mean_terminated_length": 283.5, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.0937096476664822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.012020637281239033, + "learning_rate": 9.34562211981567e-06, + "loss": 0.0005, + "num_tokens": 4478661.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.0938941154768493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.012052517209667712, + "learning_rate": 9.36405529953917e-06, + "loss": 0.0005, + "num_tokens": 4483110.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 386.125, + "completions/mean_terminated_length": 386.125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.09407858328721638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.006277043285081163, + "learning_rate": 9.382488479262674e-06, + "loss": 0.0003, + "num_tokens": 4491287.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 421.625, + "completions/mean_terminated_length": 421.625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.09426305109758347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.01542604630230926, + "learning_rate": 9.400921658986176e-06, + "loss": 0.0006, + "num_tokens": 4502228.0, + "reward": 1.8406250476837158, + "reward_std": 0.2146165668964386, + "rewards/fixed_code_pass_all_test_reward/mean": 0.840624988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.2146165817975998, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.09444751890795056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.022664624731987715, + "learning_rate": 9.419354838709677e-06, + "loss": 0.0009, + "num_tokens": 4510765.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.09463198671831766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.013284680084325373, + "learning_rate": 9.43778801843318e-06, + "loss": 0.0005, + "num_tokens": 4519541.0, + "reward": 1.5735294818878174, + "reward_std": 0.42037805914878845, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5735294222831726, + "rewards/fixed_code_pass_all_test_reward/std": 0.4203781187534332, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.09481645452868474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.019034119206480682, + "learning_rate": 9.456221198156684e-06, + "loss": 0.0008, + "num_tokens": 4525134.0, + "reward": 1.6759867668151855, + "reward_std": 0.27356648445129395, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6759868264198303, + "rewards/fixed_code_pass_all_test_reward/std": 0.27356648445129395, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.09500092233905183, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.010230286192381755, + "learning_rate": 9.474654377880185e-06, + "loss": 0.0004, + "num_tokens": 4530347.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 159.75, + "completions/mean_terminated_length": 159.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.09518539014941893, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.019840551540255547, + "learning_rate": 9.493087557603687e-06, + "loss": 0.0008, + "num_tokens": 4534361.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 693.875, + "completions/mean_terminated_length": 693.875, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.09536985795978602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.008218316215788946, + "learning_rate": 9.51152073732719e-06, + "loss": 0.0003, + "num_tokens": 4549136.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 465.75, + "completions/mean_terminated_length": 465.75, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.0955543257701531, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.012049072829540819, + "learning_rate": 9.529953917050691e-06, + "loss": 0.0005, + "num_tokens": 4558054.0, + "reward": 1.4642857313156128, + "reward_std": 0.44361352920532227, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.44361358880996704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 427.125, + "completions/mean_terminated_length": 427.125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.0957387935805202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.00742917726165615, + "learning_rate": 9.548387096774195e-06, + "loss": 0.0003, + "num_tokens": 4568471.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 276.375, + "completions/mean_terminated_length": 276.375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.09592326139088729, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.01198087417287752, + "learning_rate": 9.566820276497696e-06, + "loss": 0.0005, + "num_tokens": 4576826.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 548.375, + "completions/mean_terminated_length": 548.375, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.09610772920125438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.004077940568095073, + "learning_rate": 9.5852534562212e-06, + "loss": 0.0002, + "num_tokens": 4589037.0, + "reward": 1.784999966621399, + "reward_std": 0.40507495403289795, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7849999666213989, + "rewards/fixed_code_pass_all_test_reward/std": 0.40507495403289795, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 338.0, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.09629219701162148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.005467346258228645, + "learning_rate": 9.603686635944701e-06, + "loss": 0.0002, + "num_tokens": 4599037.0, + "reward": 1.7801203727722168, + "reward_std": 0.06134949252009392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7801204919815063, + "rewards/fixed_code_pass_all_test_reward/std": 0.0613495409488678, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 439.375, + "completions/mean_terminated_length": 439.375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.09647666482198856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.021775180648546666, + "learning_rate": 9.622119815668204e-06, + "loss": 0.0009, + "num_tokens": 4611288.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 383.75, + "completions/mean_terminated_length": 383.75, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.09666113263235565, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.015284803317626938, + "learning_rate": 9.640552995391706e-06, + "loss": 0.0006, + "num_tokens": 4617878.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 340.375, + "completions/mean_terminated_length": 340.375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.09684560044272275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.004679077537730336, + "learning_rate": 9.658986175115209e-06, + "loss": 0.0002, + "num_tokens": 4626841.0, + "reward": 1.8499999046325684, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 579.875, + "completions/mean_terminated_length": 579.875, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "epoch": 0.09703006825308984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.65625, + "kl": 0.003018162358785048, + "learning_rate": 9.67741935483871e-06, + "loss": 0.0001, + "num_tokens": 4640264.0, + "reward": 1.774999976158142, + "reward_std": 0.15507294237613678, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.15507294237613678, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 306.0, + "completions/mean_terminated_length": 306.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.09721453606345692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.006314793601632118, + "learning_rate": 9.695852534562212e-06, + "loss": 0.0003, + "num_tokens": 4649088.0, + "reward": 1.8333332538604736, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1938.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 968.625, + "completions/mean_terminated_length": 968.625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.09739900387382401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6484375, + "kl": 0.0034916122094728053, + "learning_rate": 9.714285714285715e-06, + "loss": 0.0001, + "num_tokens": 4663909.0, + "reward": 1.396484375, + "reward_std": 0.7335879802703857, + "rewards/fixed_code_pass_all_test_reward/mean": 0.521484375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5139608979225159, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 399.0, + "completions/mean_terminated_length": 399.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.09758347168419111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.012409394374117255, + "learning_rate": 9.732718894009218e-06, + "loss": 0.0005, + "num_tokens": 4672829.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 230.5, + "completions/mean_terminated_length": 230.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.0977679394945582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.014863356482237577, + "learning_rate": 9.75115207373272e-06, + "loss": 0.0006, + "num_tokens": 4682753.0, + "reward": 1.3894927501678467, + "reward_std": 0.4045705795288086, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3894927501678467, + "rewards/fixed_code_pass_all_test_reward/std": 0.4045705795288086, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 293.875, + "completions/mean_terminated_length": 293.875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.09795240730492528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.03738776163663715, + "learning_rate": 9.769585253456221e-06, + "loss": 0.0015, + "num_tokens": 4692536.0, + "reward": 1.274999976158142, + "reward_std": 0.45276927947998047, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2750000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.45276927947998047, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.09813687511529239, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.023198360693641007, + "learning_rate": 9.788018433179725e-06, + "loss": 0.0009, + "num_tokens": 4701891.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 147.375, + "completions/mean_terminated_length": 147.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.09832134292565947, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.71875, + "kl": 0.026852728566154838, + "learning_rate": 9.806451612903226e-06, + "loss": 0.0011, + "num_tokens": 4705750.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.09850581073602656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.008462803525617346, + "learning_rate": 9.82488479262673e-06, + "loss": 0.0003, + "num_tokens": 4714313.0, + "reward": 1.451923131942749, + "reward_std": 0.3143174350261688, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5769230723381042, + "rewards/fixed_code_pass_all_test_reward/std": 0.3076923191547394, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 405.375, + "completions/mean_terminated_length": 405.375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.09869027854639366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.00855463364860043, + "learning_rate": 9.843317972350231e-06, + "loss": 0.0003, + "num_tokens": 4722204.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 372.625, + "completions/mean_terminated_length": 372.625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.09887474635676075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.024772272241534665, + "learning_rate": 9.861751152073733e-06, + "loss": 0.001, + "num_tokens": 4731697.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 279.25, + "completions/mean_terminated_length": 279.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.09905921416712783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.015534053789451718, + "learning_rate": 9.880184331797236e-06, + "loss": 0.0006, + "num_tokens": 4739891.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 528.625, + "completions/mean_terminated_length": 528.625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.09924368197749493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.009997292479965836, + "learning_rate": 9.898617511520739e-06, + "loss": 0.0004, + "num_tokens": 4753312.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 331.375, + "completions/mean_terminated_length": 331.375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.09942814978786202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.011408571677748114, + "learning_rate": 9.91705069124424e-06, + "loss": 0.0005, + "num_tokens": 4763083.0, + "reward": 1.21875, + "reward_std": 0.348590224981308, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.018483906984329224, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 281.375, + "completions/mean_terminated_length": 281.375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.0996126175982291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.01086769945686683, + "learning_rate": 9.935483870967742e-06, + "loss": 0.0004, + "num_tokens": 4768294.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 153.5, + "completions/mean_terminated_length": 153.5, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.0997970854085962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.80078125, + "kl": 0.06066339008975774, + "learning_rate": 9.953917050691245e-06, + "loss": 0.0024, + "num_tokens": 4772362.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 381.375, + "completions/mean_terminated_length": 381.375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.09998155321896329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.009474723774474114, + "learning_rate": 9.972350230414747e-06, + "loss": 0.0004, + "num_tokens": 4783197.0, + "reward": 1.329861044883728, + "reward_std": 0.1272396594285965, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3298611044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.1272396296262741, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.10016602102933038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.018227670050691813, + "learning_rate": 9.99078341013825e-06, + "loss": 0.0007, + "num_tokens": 4793279.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 404.5, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.10035048883969747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.011913210444618016, + "learning_rate": 1.0009216589861752e-05, + "loss": 0.0005, + "num_tokens": 4801563.0, + "reward": 1.15625, + "reward_std": 0.06681530922651291, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 365.625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.10053495665006457, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.224609375, + "kl": 0.014751313021406531, + "learning_rate": 1.0027649769585255e-05, + "loss": 0.0006, + "num_tokens": 4809440.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 241.75, + "completions/mean_terminated_length": 241.75, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.10071942446043165, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.008262459567049518, + "learning_rate": 1.0046082949308758e-05, + "loss": 0.0003, + "num_tokens": 4814430.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 225.625, + "completions/mean_terminated_length": 225.625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.10090389227079874, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.017391397792380303, + "learning_rate": 1.0064516129032258e-05, + "loss": 0.0007, + "num_tokens": 4820131.0, + "reward": 1.515625, + "reward_std": 0.5194326043128967, + "rewards/fixed_code_pass_all_test_reward/mean": 0.515625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5194326639175415, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.10108836008116584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8828125, + "kl": 0.0258788182400167, + "learning_rate": 1.0082949308755761e-05, + "loss": 0.001, + "num_tokens": 4830180.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 328.75, + "completions/mean_terminated_length": 328.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.10127282789153293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.013796435610856861, + "learning_rate": 1.0101382488479263e-05, + "loss": 0.0006, + "num_tokens": 4838914.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.10145729570190001, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.004797032437636517, + "learning_rate": 1.0119815668202766e-05, + "loss": 0.0002, + "num_tokens": 4844145.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 303.75, + "completions/mean_terminated_length": 303.75, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.10164176351226711, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.02610538701992482, + "learning_rate": 1.0138248847926269e-05, + "loss": 0.001, + "num_tokens": 4853927.0, + "reward": 1.60546875, + "reward_std": 0.4218956530094147, + "rewards/fixed_code_pass_all_test_reward/mean": 0.60546875, + "rewards/fixed_code_pass_all_test_reward/std": 0.42189568281173706, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.1018262313226342, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0291748046875, + "kl": 0.005364995478885248, + "learning_rate": 1.015668202764977e-05, + "loss": 0.0002, + "num_tokens": 4858413.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.10201069913300129, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.027199636911973357, + "learning_rate": 1.0175115207373272e-05, + "loss": 0.0011, + "num_tokens": 4866000.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 363.875, + "completions/mean_terminated_length": 363.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.10219516694336839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.015301419596653432, + "learning_rate": 1.0193548387096774e-05, + "loss": 0.0006, + "num_tokens": 4874199.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 224.625, + "completions/mean_terminated_length": 224.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.10237963475373547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.032321638660505414, + "learning_rate": 1.0211981566820277e-05, + "loss": 0.0013, + "num_tokens": 4880036.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 428.25, + "completions/mean_terminated_length": 428.25, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.10256410256410256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.009116288798395544, + "learning_rate": 1.023041474654378e-05, + "loss": 0.0004, + "num_tokens": 4890758.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 154.125, + "completions/mean_terminated_length": 154.125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.10274857037446966, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.025524010183289647, + "learning_rate": 1.0248847926267282e-05, + "loss": 0.001, + "num_tokens": 4894863.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 321.75, + "completions/mean_terminated_length": 321.75, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.10293303818483675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2119140625, + "kl": 0.027910651348065585, + "learning_rate": 1.0267281105990785e-05, + "loss": 0.0011, + "num_tokens": 4902597.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.10311750599520383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.02021293924190104, + "learning_rate": 1.0285714285714285e-05, + "loss": 0.0008, + "num_tokens": 4908443.0, + "reward": 1.2916666269302368, + "reward_std": 0.03450329229235649, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0345032773911953, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 388.0, + "completions/mean_terminated_length": 388.0, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.10330197380557093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.009626896004192531, + "learning_rate": 1.0304147465437788e-05, + "loss": 0.0004, + "num_tokens": 4916147.0, + "reward": 1.90625, + "reward_std": 0.03788074478507042, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.03788072615861893, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 640.75, + "completions/mean_terminated_length": 439.71429443359375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.10348644161593802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.55078125, + "kl": 0.005345965866581537, + "learning_rate": 1.0322580645161291e-05, + "loss": 0.0002, + "num_tokens": 4929937.0, + "reward": 1.2833333015441895, + "reward_std": 0.5185449719429016, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40833333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.1649915874004364, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 286.125, + "completions/mean_terminated_length": 286.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.10367090942630511, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.011743417038815096, + "learning_rate": 1.0341013824884793e-05, + "loss": 0.0005, + "num_tokens": 4935042.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 209.25, + "completions/mean_terminated_length": 209.25, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1038553772366722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.02816119126509875, + "learning_rate": 1.0359447004608296e-05, + "loss": 0.0011, + "num_tokens": 4943764.0, + "reward": 1.046875, + "reward_std": 0.03477181866765022, + "rewards/fixed_code_pass_all_test_reward/mean": 0.046875, + "rewards/fixed_code_pass_all_test_reward/std": 0.034771788865327835, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 859.375, + "completions/mean_terminated_length": 859.375, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "epoch": 0.1040398450470393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.006007810065057129, + "learning_rate": 1.03778801843318e-05, + "loss": 0.0002, + "num_tokens": 4961175.0, + "reward": 1.0625, + "reward_std": 0.9082404375076294, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4374999701976776, + "rewards/fixed_code_pass_all_test_reward/std": 0.426665723323822, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 325.375, + "completions/mean_terminated_length": 325.375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.10422431285740638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.002705598730244674, + "learning_rate": 1.0396313364055299e-05, + "loss": 0.0001, + "num_tokens": 4967762.0, + "reward": 1.9375, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 759.5, + "completions/mean_terminated_length": 759.5, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "epoch": 0.10440878066777347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.007965603726916015, + "learning_rate": 1.0414746543778802e-05, + "loss": 0.0003, + "num_tokens": 4984398.0, + "reward": 1.5625, + "reward_std": 0.4955156147480011, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 311.375, + "completions/mean_terminated_length": 311.375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.10459324847814057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031982421875, + "kl": 0.003391963335161563, + "learning_rate": 1.0433179723502306e-05, + "loss": 0.0001, + "num_tokens": 4990913.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 168.5, + "completions/mean_terminated_length": 168.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.10477771628850766, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.71875, + "kl": 0.011580035963561386, + "learning_rate": 1.0451612903225807e-05, + "loss": 0.0005, + "num_tokens": 4995021.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 282.75, + "completions/mean_terminated_length": 282.75, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.10496218409887474, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.01358648284804076, + "learning_rate": 1.047004608294931e-05, + "loss": 0.0005, + "num_tokens": 5001659.0, + "reward": 1.8242753744125366, + "reward_std": 0.08177722245454788, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8242753744125366, + "rewards/fixed_code_pass_all_test_reward/std": 0.08177726715803146, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.10514665190924184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.03694136347621679, + "learning_rate": 1.0488479262672814e-05, + "loss": 0.0015, + "num_tokens": 5010368.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 202.75, + "completions/mean_terminated_length": 202.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.10533111971960893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.012418287515174598, + "learning_rate": 1.0506912442396313e-05, + "loss": 0.0005, + "num_tokens": 5014950.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 209.375, + "completions/mean_terminated_length": 209.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.10551558752997602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.008490026375511661, + "learning_rate": 1.0525345622119817e-05, + "loss": 0.0003, + "num_tokens": 5019489.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 332.25, + "completions/mean_terminated_length": 332.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.10570005534034312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.021591264056041837, + "learning_rate": 1.0543778801843318e-05, + "loss": 0.0009, + "num_tokens": 5031435.0, + "reward": 1.0499999523162842, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05000000074505806, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 220.125, + "completions/mean_terminated_length": 220.125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.1058845231507102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.03303455375134945, + "learning_rate": 1.0562211981566821e-05, + "loss": 0.0013, + "num_tokens": 5038716.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 255.0, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.10606899096107729, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.02922314265742898, + "learning_rate": 1.0580645161290325e-05, + "loss": 0.0012, + "num_tokens": 5048540.0, + "reward": 1.4583333730697632, + "reward_std": 0.669061541557312, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.3949388563632965, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 104.375, + "completions/mean_terminated_length": 104.375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.10625345877144439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.023002833884675056, + "learning_rate": 1.0599078341013826e-05, + "loss": 0.0009, + "num_tokens": 5052823.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 530.125, + "completions/mean_terminated_length": 530.125, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.10643792658181148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.01133900583954528, + "learning_rate": 1.061751152073733e-05, + "loss": 0.0005, + "num_tokens": 5067360.0, + "reward": 0.26923078298568726, + "reward_std": 0.49851855635643005, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01923076994717121, + "rewards/fixed_code_pass_all_test_reward/std": 0.03560846671462059, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 201.375, + "completions/mean_terminated_length": 201.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.10662239439217856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03943873860407621, + "learning_rate": 1.063594470046083e-05, + "loss": 0.0016, + "num_tokens": 5074867.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 233.375, + "completions/mean_terminated_length": 233.375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.10680686220254565, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.024694595718756318, + "learning_rate": 1.0654377880184332e-05, + "loss": 0.001, + "num_tokens": 5084270.0, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 289.25, + "completions/mean_terminated_length": 289.25, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.10699133001291275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.02643735590390861, + "learning_rate": 1.0672811059907836e-05, + "loss": 0.0011, + "num_tokens": 5093064.0, + "reward": 1.7060810327529907, + "reward_std": 0.33397406339645386, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7060810327529907, + "rewards/fixed_code_pass_all_test_reward/std": 0.33397406339645386, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.10717579782327984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.021516600623726845, + "learning_rate": 1.0691244239631337e-05, + "loss": 0.0009, + "num_tokens": 5102137.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 311.375, + "completions/mean_terminated_length": 311.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.10736026563364692, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032470703125, + "kl": 0.004570527788018808, + "learning_rate": 1.070967741935484e-05, + "loss": 0.0002, + "num_tokens": 5108540.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.10754473344401402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.022146267001517117, + "learning_rate": 1.0728110599078344e-05, + "loss": 0.0009, + "num_tokens": 5115203.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 454.125, + "completions/mean_terminated_length": 454.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.10772920125438111, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.017787566874176264, + "learning_rate": 1.0746543778801843e-05, + "loss": 0.0007, + "num_tokens": 5127564.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.1079136690647482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.029783689766190946, + "learning_rate": 1.0764976958525347e-05, + "loss": 0.0012, + "num_tokens": 5133500.0, + "reward": 0.9524999856948853, + "reward_std": 0.29798126220703125, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07750000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.06713525950908661, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 304.625, + "completions/mean_terminated_length": 304.625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.1080981368751153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.015450290171429515, + "learning_rate": 1.0783410138248848e-05, + "loss": 0.0006, + "num_tokens": 5142377.0, + "reward": 1.7265625, + "reward_std": 0.4251280128955841, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7265625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4251280128955841, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.10828260468548238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.029802936245687306, + "learning_rate": 1.0801843317972351e-05, + "loss": 0.0012, + "num_tokens": 5152255.0, + "reward": 1.8952702283859253, + "reward_std": 0.19962267577648163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8952702283859253, + "rewards/fixed_code_pass_all_test_reward/std": 0.19962267577648163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 224.0, + "completions/mean_terminated_length": 224.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.10846707249584947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.02647989382967353, + "learning_rate": 1.0820276497695855e-05, + "loss": 0.0011, + "num_tokens": 5161055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 401.0, + "completions/mean_terminated_length": 401.0, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.10865154030621657, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.01807076029945165, + "learning_rate": 1.0838709677419356e-05, + "loss": 0.0007, + "num_tokens": 5169391.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 224.0, + "completions/mean_terminated_length": 224.0, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.10883600811658366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.006720381221384741, + "learning_rate": 1.0857142857142858e-05, + "loss": 0.0003, + "num_tokens": 5174319.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 554.5, + "completions/mean_terminated_length": 554.5, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.10902047592695074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.007886221806984395, + "learning_rate": 1.087557603686636e-05, + "loss": 0.0003, + "num_tokens": 5184907.0, + "reward": 1.9444444179534912, + "reward_std": 0.11878276616334915, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.11878277361392975, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.10920494373731784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.029293954838067293, + "learning_rate": 1.0894009216589863e-05, + "loss": 0.0012, + "num_tokens": 5190653.0, + "reward": 1.05978262424469, + "reward_std": 0.15228478610515594, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05978260561823845, + "rewards/fixed_code_pass_all_test_reward/std": 0.15228478610515594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 232.875, + "completions/mean_terminated_length": 232.875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.10938941154768493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.024786977330222726, + "learning_rate": 1.0912442396313366e-05, + "loss": 0.001, + "num_tokens": 5198900.0, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 276.875, + "completions/mean_terminated_length": 276.875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.10957387935805202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.01650290295947343, + "learning_rate": 1.0930875576036867e-05, + "loss": 0.0007, + "num_tokens": 5205091.0, + "reward": 1.165000081062317, + "reward_std": 0.04242644086480141, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16500000655651093, + "rewards/fixed_code_pass_all_test_reward/std": 0.04242641106247902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 237.375, + "completions/mean_terminated_length": 237.375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.1097583471684191, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.03848157008178532, + "learning_rate": 1.094930875576037e-05, + "loss": 0.0015, + "num_tokens": 5214942.0, + "reward": 1.2687499523162842, + "reward_std": 0.6099985837936401, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39375001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.3668763041496277, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.1099428149787862, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.01824515702901408, + "learning_rate": 1.096774193548387e-05, + "loss": 0.0007, + "num_tokens": 5223324.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 147.25, + "completions/mean_terminated_length": 147.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.11012728278915329, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.3125, + "kl": 0.02129334374330938, + "learning_rate": 1.0986175115207374e-05, + "loss": 0.0009, + "num_tokens": 5227550.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 148.0, + "completions/mean_terminated_length": 148.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.11031175059952038, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.033238760370295495, + "learning_rate": 1.1004608294930877e-05, + "loss": 0.0013, + "num_tokens": 5231478.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.11049621840988748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.028226650087162852, + "learning_rate": 1.1023041474654378e-05, + "loss": 0.0011, + "num_tokens": 5237934.0, + "reward": 1.438829779624939, + "reward_std": 0.6352447271347046, + "rewards/fixed_code_pass_all_test_reward/mean": 0.563829779624939, + "rewards/fixed_code_pass_all_test_reward/std": 0.3426975905895233, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 518.0, + "completions/mean_terminated_length": 518.0, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.11068068622025456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.013978310453239828, + "learning_rate": 1.1041474654377882e-05, + "loss": 0.0006, + "num_tokens": 5253166.0, + "reward": 1.1607141494750977, + "reward_std": 0.4734187722206116, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.3148418068885803, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.11086515403062165, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.024668479105457664, + "learning_rate": 1.1059907834101385e-05, + "loss": 0.001, + "num_tokens": 5259232.0, + "reward": 1.5384615659713745, + "reward_std": 0.6014915108680725, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.30423885583877563, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 205.625, + "completions/mean_terminated_length": 205.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.11104962184098875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.02973625552840531, + "learning_rate": 1.1078341013824885e-05, + "loss": 0.0012, + "num_tokens": 5268149.0, + "reward": 1.1287128925323486, + "reward_std": 0.640127956867218, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25371289253234863, + "rewards/fixed_code_pass_all_test_reward/std": 0.4607324004173279, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 182.625, + "completions/mean_terminated_length": 182.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.11123408965135584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.05151755781844258, + "learning_rate": 1.1096774193548388e-05, + "loss": 0.0021, + "num_tokens": 5274778.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 223.25, + "completions/mean_terminated_length": 223.25, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.11141855746172293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.03009527293033898, + "learning_rate": 1.111520737327189e-05, + "loss": 0.0012, + "num_tokens": 5284420.0, + "reward": 1.6722973585128784, + "reward_std": 0.34677374362945557, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6722972989082336, + "rewards/fixed_code_pass_all_test_reward/std": 0.34677374362945557, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 403.25, + "completions/mean_terminated_length": 403.25, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.11160302527209003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.014786576968617737, + "learning_rate": 1.1133640552995393e-05, + "loss": 0.0006, + "num_tokens": 5293350.0, + "reward": 1.3229167461395264, + "reward_std": 0.3225564956665039, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4479166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.36983880400657654, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 525.25, + "completions/mean_terminated_length": 307.71429443359375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.11178749308245711, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49609375, + "kl": 0.014758320088731125, + "learning_rate": 1.1152073732718896e-05, + "loss": 0.0006, + "num_tokens": 5304880.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1119719608928242, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.03478221967816353, + "learning_rate": 1.1170506912442397e-05, + "loss": 0.0014, + "num_tokens": 5310014.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 431.5, + "completions/mean_terminated_length": 431.5, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.1121564287031913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.018478322017472237, + "learning_rate": 1.1188940092165899e-05, + "loss": 0.0007, + "num_tokens": 5323522.0, + "reward": 1.8693182468414307, + "reward_std": 0.3344077169895172, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8693181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.3344077169895172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.11234089651355839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.033673537662252784, + "learning_rate": 1.12073732718894e-05, + "loss": 0.0013, + "num_tokens": 5332121.0, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 358.5, + "completions/mean_terminated_length": 358.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.11252536432392547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.014329691766761243, + "learning_rate": 1.1225806451612904e-05, + "loss": 0.0006, + "num_tokens": 5342309.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 298.375, + "completions/mean_terminated_length": 298.375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.11270983213429256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.01646226894808933, + "learning_rate": 1.1244239631336407e-05, + "loss": 0.0007, + "num_tokens": 5350120.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 370.875, + "completions/mean_terminated_length": 370.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.11289429994465966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.01793209568131715, + "learning_rate": 1.1262672811059908e-05, + "loss": 0.0007, + "num_tokens": 5360471.0, + "reward": 1.446969747543335, + "reward_std": 0.2357022762298584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4469697177410126, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 512.0, + "completions/mean_terminated_length": 512.0, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.11307876775502675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.020935556502081454, + "learning_rate": 1.1281105990783412e-05, + "loss": 0.0008, + "num_tokens": 5374583.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.11326323556539383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.0180795278865844, + "learning_rate": 1.1299539170506913e-05, + "loss": 0.0007, + "num_tokens": 5383073.0, + "reward": 1.7674418687820435, + "reward_std": 0.4313310980796814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7674418687820435, + "rewards/fixed_code_pass_all_test_reward/std": 0.4313310980796814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 264.625, + "completions/mean_terminated_length": 264.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.11344770337576093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.01955062581691891, + "learning_rate": 1.1317972350230415e-05, + "loss": 0.0008, + "num_tokens": 5389406.0, + "reward": 1.1222527027130127, + "reward_std": 0.7924602031707764, + "rewards/fixed_code_pass_all_test_reward/mean": 0.37225276231765747, + "rewards/fixed_code_pass_all_test_reward/std": 0.44832080602645874, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 273.0, + "completions/mean_terminated_length": 273.0, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.11363217118612802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.02083478239364922, + "learning_rate": 1.1336405529953918e-05, + "loss": 0.0008, + "num_tokens": 5398158.0, + "reward": 1.49609375, + "reward_std": 0.4480356276035309, + "rewards/fixed_code_pass_all_test_reward/mean": 0.49609375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4480356276035309, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 334.75, + "completions/mean_terminated_length": 334.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.1138166389964951, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.015172100509516895, + "learning_rate": 1.1354838709677421e-05, + "loss": 0.0006, + "num_tokens": 5407732.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 537.875, + "completions/mean_terminated_length": 537.875, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.1140011068068622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.007468666532076895, + "learning_rate": 1.1373271889400923e-05, + "loss": 0.0003, + "num_tokens": 5419843.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 280.375, + "completions/mean_terminated_length": 280.375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.1141855746172293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.018508557637687773, + "learning_rate": 1.1391705069124426e-05, + "loss": 0.0007, + "num_tokens": 5429022.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 288.0, + "completions/mean_terminated_length": 288.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.11437004242759638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.03095754235982895, + "learning_rate": 1.1410138248847926e-05, + "loss": 0.0012, + "num_tokens": 5440286.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 168.125, + "completions/mean_terminated_length": 168.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.11455451023796348, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.019285981077700853, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.0008, + "num_tokens": 5444367.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.11473897804833057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.0254968018271029, + "learning_rate": 1.1447004608294932e-05, + "loss": 0.001, + "num_tokens": 5452980.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 145.25, + "completions/mean_terminated_length": 145.25, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.11492344585869765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.01032915327232331, + "learning_rate": 1.1465437788018434e-05, + "loss": 0.0004, + "num_tokens": 5456862.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 318.875, + "completions/mean_terminated_length": 318.875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.11510791366906475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.013131651096045971, + "learning_rate": 1.1483870967741937e-05, + "loss": 0.0005, + "num_tokens": 5465573.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 680.375, + "completions/mean_terminated_length": 680.375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.11529238147943184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.007369970437139273, + "learning_rate": 1.150230414746544e-05, + "loss": 0.0003, + "num_tokens": 5482096.0, + "reward": 1.189814805984497, + "reward_std": 0.18298962712287903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.18981480598449707, + "rewards/fixed_code_pass_all_test_reward/std": 0.18298962712287903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 493.875, + "completions/mean_terminated_length": 493.875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.11547684928979893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.008883859787601978, + "learning_rate": 1.152073732718894e-05, + "loss": 0.0004, + "num_tokens": 5493063.0, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.11566131710016603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1884765625, + "kl": 0.03757879603654146, + "learning_rate": 1.1539170506912443e-05, + "loss": 0.0015, + "num_tokens": 5501743.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 127.125, + "completions/mean_terminated_length": 127.125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.11584578491053311, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.25, + "kl": 0.027318429434671998, + "learning_rate": 1.1557603686635945e-05, + "loss": 0.0011, + "num_tokens": 5505400.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 632.0, + "completions/mean_terminated_length": 632.0, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.1160302527209002, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.466796875, + "kl": 0.006017070059897378, + "learning_rate": 1.1576036866359448e-05, + "loss": 0.0002, + "num_tokens": 5517320.0, + "reward": 1.1458332538604736, + "reward_std": 0.058925557881593704, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.11621472053126729, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.01838120660977438, + "learning_rate": 1.1594470046082951e-05, + "loss": 0.0007, + "num_tokens": 5521774.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 432.0, + "completions/mean_terminated_length": 432.0, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.11639918834163439, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.00994636700488627, + "learning_rate": 1.1612903225806453e-05, + "loss": 0.0004, + "num_tokens": 5529886.0, + "reward": 1.798295497894287, + "reward_std": 0.04108459874987602, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7982954382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.041084595024585724, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 280.875, + "completions/mean_terminated_length": 280.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.11658365615200147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.021577075240202248, + "learning_rate": 1.1631336405529954e-05, + "loss": 0.0009, + "num_tokens": 5535989.0, + "reward": 1.6586538553237915, + "reward_std": 0.47214797139167786, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6586538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.47214800119400024, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.11676812396236856, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.011569142865482718, + "learning_rate": 1.1649769585253456e-05, + "loss": 0.0005, + "num_tokens": 5541010.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 471.25, + "completions/mean_terminated_length": 471.25, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.11695259177273566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.008614380727522075, + "learning_rate": 1.1668202764976959e-05, + "loss": 0.0003, + "num_tokens": 5553012.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 395.5, + "completions/mean_terminated_length": 395.5, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.11713705958310275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.01758048264309764, + "learning_rate": 1.1686635944700462e-05, + "loss": 0.0007, + "num_tokens": 5563400.0, + "reward": 1.7437500953674316, + "reward_std": 0.37553533911705017, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7437499761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.37553533911705017, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 526.625, + "completions/mean_terminated_length": 526.625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.11732152739346983, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.011207400704734027, + "learning_rate": 1.1705069124423964e-05, + "loss": 0.0004, + "num_tokens": 5576717.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 598.375, + "completions/mean_terminated_length": 598.375, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "epoch": 0.11750599520383694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.01113398966845125, + "learning_rate": 1.1723502304147467e-05, + "loss": 0.0004, + "num_tokens": 5590880.0, + "reward": 1.317307710647583, + "reward_std": 0.3088918924331665, + "rewards/fixed_code_pass_all_test_reward/mean": 0.317307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.3088918924331665, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 374.375, + "completions/mean_terminated_length": 374.375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.11769046301420402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.010011591337388381, + "learning_rate": 1.1741935483870967e-05, + "loss": 0.0004, + "num_tokens": 5597931.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 330.5, + "completions/mean_terminated_length": 330.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.11787493082457111, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.01768647402059287, + "learning_rate": 1.176036866359447e-05, + "loss": 0.0007, + "num_tokens": 5609623.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 329.5, + "completions/mean_terminated_length": 329.5, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.11805939863493821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.009646464277466293, + "learning_rate": 1.1778801843317973e-05, + "loss": 0.0004, + "num_tokens": 5616243.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 200.0, + "completions/mean_terminated_length": 200.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.1182438664453053, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.014067991636693478, + "learning_rate": 1.1797235023041475e-05, + "loss": 0.0006, + "num_tokens": 5620859.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 328.125, + "completions/mean_terminated_length": 328.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.11842833425567238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.017923756560776383, + "learning_rate": 1.1815668202764978e-05, + "loss": 0.0007, + "num_tokens": 5631436.0, + "reward": 1.537500023841858, + "reward_std": 0.36620640754699707, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5375000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.36620640754699707, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 596.75, + "completions/mean_terminated_length": 596.75, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.11861280206603948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.009247386828064919, + "learning_rate": 1.1834101382488481e-05, + "loss": 0.0004, + "num_tokens": 5644466.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 256.125, + "completions/mean_terminated_length": 256.125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.11879726987640657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.01566549949347973, + "learning_rate": 1.1852534562211983e-05, + "loss": 0.0006, + "num_tokens": 5650387.0, + "reward": 1.21875, + "reward_std": 0.321014940738678, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, + "rewards/fixed_code_pass_all_test_reward/std": 0.321014940738678, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.11898173768677366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01485956716351211, + "learning_rate": 1.1870967741935484e-05, + "loss": 0.0006, + "num_tokens": 5657120.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 127.75, + "completions/mean_terminated_length": 127.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.11916620549714074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.012116140278521925, + "learning_rate": 1.1889400921658986e-05, + "loss": 0.0005, + "num_tokens": 5660918.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.11935067330750784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.01627954060677439, + "learning_rate": 1.190783410138249e-05, + "loss": 0.0007, + "num_tokens": 5667259.0, + "reward": 1.1956522464752197, + "reward_std": 0.606918454170227, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32065218687057495, + "rewards/fixed_code_pass_all_test_reward/std": 0.3895318806171417, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 495.0, + "completions/mean_terminated_length": 495.0, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.11953514111787493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.009384277684148401, + "learning_rate": 1.1926267281105992e-05, + "loss": 0.0004, + "num_tokens": 5677059.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.11971960892824202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.014314852189272642, + "learning_rate": 1.1944700460829494e-05, + "loss": 0.0006, + "num_tokens": 5681436.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 390.125, + "completions/mean_terminated_length": 390.125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.11990407673860912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.02489046868868172, + "learning_rate": 1.1963133640552997e-05, + "loss": 0.001, + "num_tokens": 5692493.0, + "reward": 1.2386363744735718, + "reward_std": 0.6297194957733154, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.40945151448249817, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 474.25, + "completions/mean_terminated_length": 474.25, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.1200885445489762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.021713092806749046, + "learning_rate": 1.1981566820276497e-05, + "loss": 0.0009, + "num_tokens": 5701591.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 203.125, + "completions/mean_terminated_length": 203.125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.12027301235934329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.018083055620081723, + "learning_rate": 1.2e-05, + "loss": 0.0007, + "num_tokens": 5708296.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 639.5, + "completions/mean_terminated_length": 639.5, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "epoch": 0.12045748016971039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49609375, + "kl": 0.005555703130085021, + "learning_rate": 1.2018433179723504e-05, + "loss": 0.0002, + "num_tokens": 5718860.0, + "reward": 1.9500000476837158, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.12064194798007748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0211978608276695, + "learning_rate": 1.2036866359447005e-05, + "loss": 0.0008, + "num_tokens": 5725828.0, + "reward": 1.3624999523162842, + "reward_std": 0.2825268805027008, + "rewards/fixed_code_pass_all_test_reward/mean": 0.36250001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.2825268805027008, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 152.75, + "completions/mean_terminated_length": 152.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.12082641579044456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.007720790046732873, + "learning_rate": 1.2055299539170508e-05, + "loss": 0.0003, + "num_tokens": 5729762.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 489.125, + "completions/mean_terminated_length": 489.125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.12101088360081166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.020320967538282275, + "learning_rate": 1.2073732718894012e-05, + "loss": 0.0008, + "num_tokens": 5741347.0, + "reward": 1.15625, + "reward_std": 0.2156454473733902, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.21564547717571259, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 286.75, + "completions/mean_terminated_length": 286.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.12119535141117875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.03287648502737284, + "learning_rate": 1.2092165898617511e-05, + "loss": 0.0013, + "num_tokens": 5749689.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 181.5, + "completions/mean_terminated_length": 181.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.12137981922154584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.02426884847227484, + "learning_rate": 1.2110599078341015e-05, + "loss": 0.001, + "num_tokens": 5755773.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 470.5, + "completions/mean_terminated_length": 470.5, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.12156428703191294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0247802734375, + "kl": 0.007138237240724266, + "learning_rate": 1.2129032258064518e-05, + "loss": 0.0003, + "num_tokens": 5764649.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 338.875, + "completions/mean_terminated_length": 338.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.12174875484228002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.01998111023567617, + "learning_rate": 1.214746543778802e-05, + "loss": 0.0008, + "num_tokens": 5773280.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 238.75, + "completions/mean_terminated_length": 238.75, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.12193322265264711, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.021241999347694218, + "learning_rate": 1.2165898617511523e-05, + "loss": 0.0008, + "num_tokens": 5780102.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.1221176904630142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.02732308954000473, + "learning_rate": 1.2184331797235026e-05, + "loss": 0.0011, + "num_tokens": 5789164.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 297.875, + "completions/mean_terminated_length": 297.875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.1223021582733813, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.011022712336853147, + "learning_rate": 1.2202764976958526e-05, + "loss": 0.0004, + "num_tokens": 5796059.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 399.125, + "completions/mean_terminated_length": 399.125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.12248662608374838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.020758160739205778, + "learning_rate": 1.2221198156682029e-05, + "loss": 0.0008, + "num_tokens": 5806644.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 409.0, + "completions/mean_terminated_length": 409.0, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.12267109389411547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.016644110437482595, + "learning_rate": 1.223963133640553e-05, + "loss": 0.0007, + "num_tokens": 5818980.0, + "reward": 1.2419354915618896, + "reward_std": 0.3325643837451935, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24193547666072845, + "rewards/fixed_code_pass_all_test_reward/std": 0.3325643837451935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 229.375, + "completions/mean_terminated_length": 229.375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.12285556170448257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.012655959464609623, + "learning_rate": 1.2258064516129034e-05, + "loss": 0.0005, + "num_tokens": 5825847.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 122.625, + "completions/mean_terminated_length": 122.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.12304002951484966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.333984375, + "kl": 0.04878741386346519, + "learning_rate": 1.2276497695852537e-05, + "loss": 0.002, + "num_tokens": 5829652.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 219.0, + "completions/mean_terminated_length": 219.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.12322449732521674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.00492924073478207, + "learning_rate": 1.2294930875576038e-05, + "loss": 0.0002, + "num_tokens": 5834812.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.12340896513558385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.016838991665281355, + "learning_rate": 1.231336405529954e-05, + "loss": 0.0007, + "num_tokens": 5840693.0, + "reward": 1.4666666984558105, + "reward_std": 0.33806174993515015, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.33806169033050537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 105.625, + "completions/mean_terminated_length": 105.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.12359343294595093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.010949868243187666, + "learning_rate": 1.2331797235023041e-05, + "loss": 0.0004, + "num_tokens": 5844234.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 584.5, + "completions/mean_terminated_length": 584.5, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.12377790075631802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.009572273876983672, + "learning_rate": 1.2350230414746545e-05, + "loss": 0.0004, + "num_tokens": 5854638.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 337.75, + "completions/mean_terminated_length": 337.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.12396236856668512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0073857964016497135, + "learning_rate": 1.2368663594470048e-05, + "loss": 0.0003, + "num_tokens": 5861388.0, + "reward": 1.6944444179534912, + "reward_std": 0.5134865045547485, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8194444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.25153848528862, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 480.375, + "completions/mean_terminated_length": 480.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.1241468363770522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.012636109720915556, + "learning_rate": 1.238709677419355e-05, + "loss": 0.0005, + "num_tokens": 5870871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 263.125, + "completions/mean_terminated_length": 263.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.12433130418741929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.016431959345936775, + "learning_rate": 1.2405529953917053e-05, + "loss": 0.0007, + "num_tokens": 5877088.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 500.5, + "completions/mean_terminated_length": 500.5, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.12451577199778639, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.011068681604228914, + "learning_rate": 1.2423963133640553e-05, + "loss": 0.0004, + "num_tokens": 5885404.0, + "reward": 1.6785714626312256, + "reward_std": 0.27027443051338196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714030265808, + "rewards/fixed_code_pass_all_test_reward/std": 0.27027443051338196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 325.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.12470023980815348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.01131491910200566, + "learning_rate": 1.2442396313364056e-05, + "loss": 0.0005, + "num_tokens": 5894584.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 253.875, + "completions/mean_terminated_length": 253.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.12488470761852057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.006685057916911319, + "learning_rate": 1.2460829493087559e-05, + "loss": 0.0003, + "num_tokens": 5900255.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 319.75, + "completions/mean_terminated_length": 319.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.12506917542888765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24609375, + "kl": 0.02956463862210512, + "learning_rate": 1.247926267281106e-05, + "loss": 0.0012, + "num_tokens": 5908749.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 266.5, + "completions/mean_terminated_length": 266.5, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.12525364323925475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.008470706292428076, + "learning_rate": 1.2497695852534564e-05, + "loss": 0.0003, + "num_tokens": 5914297.0, + "reward": 1.7999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 295.875, + "completions/mean_terminated_length": 295.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.12543811104962185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.01003311324166134, + "learning_rate": 1.2516129032258067e-05, + "loss": 0.0004, + "num_tokens": 5922688.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 240.5, + "completions/mean_terminated_length": 240.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.12562257885998893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.01014643389498815, + "learning_rate": 1.2534562211981567e-05, + "loss": 0.0004, + "num_tokens": 5927764.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 351.625, + "completions/mean_terminated_length": 351.625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.12580704667035603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.014334492763737217, + "learning_rate": 1.255299539170507e-05, + "loss": 0.0006, + "num_tokens": 5933401.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 514.125, + "completions/mean_terminated_length": 514.125, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.12599151448072313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.009524707915261388, + "learning_rate": 1.2571428571428572e-05, + "loss": 0.0004, + "num_tokens": 5946890.0, + "reward": 0.8981481194496155, + "reward_std": 0.3484238386154175, + "rewards/fixed_code_pass_all_test_reward/mean": 0.023148149251937866, + "rewards/fixed_code_pass_all_test_reward/std": 0.019168486818671227, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 283.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.1261759822910902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.013763547642156482, + "learning_rate": 1.2589861751152075e-05, + "loss": 0.0006, + "num_tokens": 5954399.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 317.375, + "completions/mean_terminated_length": 317.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.1263604501014573, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.00900739652570337, + "learning_rate": 1.2608294930875578e-05, + "loss": 0.0004, + "num_tokens": 5962546.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 402.875, + "completions/mean_terminated_length": 402.875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.12654491791182437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038330078125, + "kl": 0.012674597732257098, + "learning_rate": 1.262672811059908e-05, + "loss": 0.0005, + "num_tokens": 5970577.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 343.375, + "completions/mean_terminated_length": 343.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.12672938572219147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.01283425884321332, + "learning_rate": 1.2645161290322581e-05, + "loss": 0.0005, + "num_tokens": 5978612.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.12691385353255857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.009311013913247734, + "learning_rate": 1.2663594470046083e-05, + "loss": 0.0004, + "num_tokens": 5983202.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 581.5, + "completions/mean_terminated_length": 372.0000305175781, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.12709832134292565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.625, + "kl": 0.017149349150713533, + "learning_rate": 1.2682027649769586e-05, + "loss": 0.0007, + "num_tokens": 5997262.0, + "reward": 1.4905303716659546, + "reward_std": 0.6186563372612, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6155303120613098, + "rewards/fixed_code_pass_all_test_reward/std": 0.28612905740737915, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 284.75, + "completions/mean_terminated_length": 284.75, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.12728278915329275, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.022962539340369403, + "learning_rate": 1.2700460829493089e-05, + "loss": 0.0009, + "num_tokens": 6004764.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 436.25, + "completions/mean_terminated_length": 436.25, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.12746725696365985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.009973986627301201, + "learning_rate": 1.271889400921659e-05, + "loss": 0.0004, + "num_tokens": 6013638.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12765172477402692, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.024273328890558332, + "learning_rate": 1.2737327188940094e-05, + "loss": 0.001, + "num_tokens": 6017529.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.12783619258439402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.014770885405596346, + "learning_rate": 1.2755760368663594e-05, + "loss": 0.0006, + "num_tokens": 6022373.0, + "reward": 1.9375, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 148.875, + "completions/mean_terminated_length": 148.875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.12802066039476112, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.056296321912668645, + "learning_rate": 1.2774193548387097e-05, + "loss": 0.0023, + "num_tokens": 6030612.0, + "reward": 1.6052632331848145, + "reward_std": 0.3740176558494568, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6052631139755249, + "rewards/fixed_code_pass_all_test_reward/std": 0.3740176260471344, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 457.625, + "completions/mean_terminated_length": 457.625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.1282051282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.013139610411599278, + "learning_rate": 1.27926267281106e-05, + "loss": 0.0005, + "num_tokens": 6041921.0, + "reward": 1.4964789152145386, + "reward_std": 0.3884108066558838, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6214789152145386, + "rewards/fixed_code_pass_all_test_reward/std": 0.034857384860515594, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.1283895960154953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.016443020664155483, + "learning_rate": 1.2811059907834102e-05, + "loss": 0.0007, + "num_tokens": 6048047.0, + "reward": 1.01260507106781, + "reward_std": 0.03565245121717453, + "rewards/fixed_code_pass_all_test_reward/mean": 0.012605042196810246, + "rewards/fixed_code_pass_all_test_reward/std": 0.03565244376659393, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 410.125, + "completions/mean_terminated_length": 410.125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.1285740638258624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69921875, + "kl": 0.009359834366478026, + "learning_rate": 1.2829493087557605e-05, + "loss": 0.0004, + "num_tokens": 6057568.0, + "reward": 1.625, + "reward_std": 0.11785116046667099, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1178511381149292, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 494.125, + "completions/mean_terminated_length": 494.125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.12875853163622947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.008197460934752598, + "learning_rate": 1.2847926267281108e-05, + "loss": 0.0003, + "num_tokens": 6067049.0, + "reward": 1.7675000429153442, + "reward_std": 0.09498120844364166, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7675000429153442, + "rewards/fixed_code_pass_all_test_reward/std": 0.09498120844364166, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 151.0, + "completions/mean_terminated_length": 151.0, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.12894299944659657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.03380183281842619, + "learning_rate": 1.2866359447004608e-05, + "loss": 0.0014, + "num_tokens": 6071025.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 246.25, + "completions/mean_terminated_length": 246.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.12912746725696367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.81640625, + "kl": 0.06187940342351794, + "learning_rate": 1.2884792626728111e-05, + "loss": 0.0025, + "num_tokens": 6080171.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.12931193506733074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.0343069777591154, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.0014, + "num_tokens": 6088596.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 407.125, + "completions/mean_terminated_length": 407.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.12949640287769784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.01815927936695516, + "learning_rate": 1.2921658986175116e-05, + "loss": 0.0007, + "num_tokens": 6096477.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 259.375, + "completions/mean_terminated_length": 259.375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.12968087068806494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.025725731742568314, + "learning_rate": 1.294009216589862e-05, + "loss": 0.001, + "num_tokens": 6102656.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 364.375, + "completions/mean_terminated_length": 364.375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.12986533849843201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.018579981464426965, + "learning_rate": 1.2958525345622122e-05, + "loss": 0.0007, + "num_tokens": 6113211.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.13004980630879912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.026171888457611203, + "learning_rate": 1.2976958525345624e-05, + "loss": 0.001, + "num_tokens": 6122124.0, + "reward": 1.0299999713897705, + "reward_std": 0.08485280722379684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, + "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 251.5, + "completions/mean_terminated_length": 251.5, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.13023427411916622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.03152118972502649, + "learning_rate": 1.2995391705069126e-05, + "loss": 0.0013, + "num_tokens": 6130544.0, + "reward": 1.3571429252624512, + "reward_std": 0.4501376152038574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.2645200192928314, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 350.25, + "completions/mean_terminated_length": 350.25, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.1304187419295333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.018276812275871634, + "learning_rate": 1.3013824884792627e-05, + "loss": 0.0007, + "num_tokens": 6142418.0, + "reward": 1.78125, + "reward_std": 0.3582572042942047, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3582572042942047, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 190.0, + "completions/mean_terminated_length": 190.0, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1306032097399004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1875, + "kl": 0.027830975246615708, + "learning_rate": 1.303225806451613e-05, + "loss": 0.0011, + "num_tokens": 6148754.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 757.625, + "completions/mean_terminated_length": 573.2857666015625, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.1307876775502675, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.255859375, + "kl": 0.005082618095912039, + "learning_rate": 1.3050691244239634e-05, + "loss": 0.0002, + "num_tokens": 6161671.0, + "reward": 1.0208332538604736, + "reward_std": 0.4124789237976074, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 397.0, + "completions/mean_terminated_length": 397.0, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.13097214536063456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.01624119805637747, + "learning_rate": 1.3069124423963135e-05, + "loss": 0.0006, + "num_tokens": 6169303.0, + "reward": 1.5988805294036865, + "reward_std": 0.6742870211601257, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7238805890083313, + "rewards/fixed_code_pass_all_test_reward/std": 0.35048529505729675, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 346.75, + "completions/mean_terminated_length": 346.75, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.13115661317100166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13671875, + "kl": 0.03460461529903114, + "learning_rate": 1.3087557603686638e-05, + "loss": 0.0014, + "num_tokens": 6178869.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.13134108098136876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.021799714537337422, + "learning_rate": 1.3105990783410138e-05, + "loss": 0.0009, + "num_tokens": 6189030.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 619.75, + "completions/mean_terminated_length": 619.75, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.13152554879173584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.016654736478812993, + "learning_rate": 1.3124423963133641e-05, + "loss": 0.0007, + "num_tokens": 6201876.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.13171001660210294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.018368340213783085, + "learning_rate": 1.3142857142857145e-05, + "loss": 0.0007, + "num_tokens": 6206629.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 218.5, + "completions/mean_terminated_length": 218.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.13189448441247004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.019721908261999488, + "learning_rate": 1.3161290322580646e-05, + "loss": 0.0008, + "num_tokens": 6213073.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 451.75, + "completions/mean_terminated_length": 451.75, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.1320789522228371, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.02149338141316548, + "learning_rate": 1.317972350230415e-05, + "loss": 0.0009, + "num_tokens": 6224487.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 891.0, + "completions/mean_terminated_length": 505.3333435058594, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.1322634200332042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.53515625, + "kl": 0.015899461170192808, + "learning_rate": 1.3198156682027653e-05, + "loss": 0.0006, + "num_tokens": 6237023.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 525.0, + "completions/mean_terminated_length": 525.0, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.1324478878435713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.012928415031637996, + "learning_rate": 1.3216589861751152e-05, + "loss": 0.0005, + "num_tokens": 6246591.0, + "reward": 1.6623376607894897, + "reward_std": 0.17465344071388245, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6623376607894897, + "rewards/fixed_code_pass_all_test_reward/std": 0.17465342581272125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 607.25, + "completions/mean_terminated_length": 607.25, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.13263235565393838, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.0160870713298209, + "learning_rate": 1.3235023041474656e-05, + "loss": 0.0006, + "num_tokens": 6257681.0, + "reward": 1.5833333730697632, + "reward_std": 0.1543033868074417, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 357.0, + "completions/mean_terminated_length": 357.0, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.13281682346430548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.030950271990150213, + "learning_rate": 1.3253456221198157e-05, + "loss": 0.0012, + "num_tokens": 6267121.0, + "reward": 1.1363636255264282, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 256.625, + "completions/mean_terminated_length": 256.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.13300129127467256, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.013472165039274842, + "learning_rate": 1.327188940092166e-05, + "loss": 0.0005, + "num_tokens": 6272134.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.13318575908503966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.03463879611808807, + "learning_rate": 1.3290322580645164e-05, + "loss": 0.0014, + "num_tokens": 6281695.0, + "reward": 1.8046875, + "reward_std": 0.36164847016334534, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, + "rewards/fixed_code_pass_all_test_reward/std": 0.36164847016334534, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 411.75, + "completions/mean_terminated_length": 411.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.13337022689540676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.01731483032926917, + "learning_rate": 1.3308755760368665e-05, + "loss": 0.0007, + "num_tokens": 6291781.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 127.25, + "completions/mean_terminated_length": 127.25, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.13355469470577383, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.032740039052441716, + "learning_rate": 1.3327188940092167e-05, + "loss": 0.0013, + "num_tokens": 6295615.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 229.5, + "completions/mean_terminated_length": 229.5, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.13373916251614093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.038838914944790304, + "learning_rate": 1.3345622119815668e-05, + "loss": 0.0016, + "num_tokens": 6300739.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 529.125, + "completions/mean_terminated_length": 529.125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.13392363032650803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.015299848339054734, + "learning_rate": 1.3364055299539171e-05, + "loss": 0.0006, + "num_tokens": 6311716.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 421.125, + "completions/mean_terminated_length": 421.125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.1341080981368751, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0211512305540964, + "learning_rate": 1.3382488479262675e-05, + "loss": 0.0008, + "num_tokens": 6322285.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 214.0, + "completions/mean_terminated_length": 214.0, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.1342925659472422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.026305558742024004, + "learning_rate": 1.3400921658986176e-05, + "loss": 0.0011, + "num_tokens": 6333773.0, + "reward": 1.322115421295166, + "reward_std": 0.3137706220149994, + "rewards/fixed_code_pass_all_test_reward/mean": 0.322115421295166, + "rewards/fixed_code_pass_all_test_reward/std": 0.31377068161964417, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 423.125, + "completions/mean_terminated_length": 423.125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1344770337576093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.030887339846231043, + "learning_rate": 1.341935483870968e-05, + "loss": 0.0012, + "num_tokens": 6344086.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.13466150156797638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.02194003330077976, + "learning_rate": 1.343778801843318e-05, + "loss": 0.0009, + "num_tokens": 6353355.0, + "reward": 1.5646929740905762, + "reward_std": 0.1643713414669037, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5646929740905762, + "rewards/fixed_code_pass_all_test_reward/std": 0.1643713265657425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 190.875, + "completions/mean_terminated_length": 190.875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.13484596937834348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.022425944334827363, + "learning_rate": 1.3456221198156683e-05, + "loss": 0.0009, + "num_tokens": 6357882.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.13503043718871058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.027519751572981477, + "learning_rate": 1.3474654377880186e-05, + "loss": 0.0011, + "num_tokens": 6366934.0, + "reward": 1.946969747543335, + "reward_std": 0.14999237656593323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9469696879386902, + "rewards/fixed_code_pass_all_test_reward/std": 0.14999234676361084, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 349.5, + "completions/mean_terminated_length": 349.5, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.13521490499907765, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.031093080062419176, + "learning_rate": 1.3493087557603687e-05, + "loss": 0.0012, + "num_tokens": 6375602.0, + "reward": 1.4375, + "reward_std": 0.6781013607978821, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1303.0, + "completions/max_terminated_length": 1303.0, + "completions/mean_length": 580.25, + "completions/mean_terminated_length": 580.25, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.13539937280944475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.04468227177858353, + "learning_rate": 1.351152073732719e-05, + "loss": 0.0018, + "num_tokens": 6392132.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 188.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.13558384061981185, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.024280450074002147, + "learning_rate": 1.3529953917050694e-05, + "loss": 0.001, + "num_tokens": 6398625.0, + "reward": 1.1458332538604736, + "reward_std": 0.03857579827308655, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.03857583925127983, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 286.0, + "completions/mean_terminated_length": 286.0, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.13576830843017892, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.026149121462367475, + "learning_rate": 1.3548387096774194e-05, + "loss": 0.001, + "num_tokens": 6406457.0, + "reward": 1.9358108043670654, + "reward_std": 0.18155446648597717, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9358108043670654, + "rewards/fixed_code_pass_all_test_reward/std": 0.18155445158481598, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 338.125, + "completions/mean_terminated_length": 338.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.13595277624054602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.03232508595101535, + "learning_rate": 1.3566820276497697e-05, + "loss": 0.0013, + "num_tokens": 6415186.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 766.25, + "completions/mean_terminated_length": 766.25, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.13613724405091313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.015500634792260826, + "learning_rate": 1.3585253456221198e-05, + "loss": 0.0006, + "num_tokens": 6431556.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 391.5, + "completions/mean_terminated_length": 391.5, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.1363217118612802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.022174455225467682, + "learning_rate": 1.3603686635944702e-05, + "loss": 0.0009, + "num_tokens": 6439840.0, + "reward": 1.640625, + "reward_std": 0.4288038909435272, + "rewards/fixed_code_pass_all_test_reward/mean": 0.640625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4288038909435272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 355.625, + "completions/mean_terminated_length": 355.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.1365061796716473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.014136144774965942, + "learning_rate": 1.3622119815668205e-05, + "loss": 0.0006, + "num_tokens": 6447061.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 379.0, + "completions/mean_terminated_length": 379.0, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.1366906474820144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.022066552191972733, + "learning_rate": 1.3640552995391706e-05, + "loss": 0.0009, + "num_tokens": 6456221.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 542.5, + "completions/mean_terminated_length": 542.5, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.13687511529238147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.014162875013425946, + "learning_rate": 1.3658986175115208e-05, + "loss": 0.0006, + "num_tokens": 6470641.0, + "reward": 1.8977272510528564, + "reward_std": 0.14114977419376373, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8977272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.14114975929260254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 257.5, + "completions/mean_terminated_length": 257.5, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.13705958310274857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.020656749606132507, + "learning_rate": 1.367741935483871e-05, + "loss": 0.0008, + "num_tokens": 6476501.0, + "reward": 1.2857142686843872, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 492.125, + "completions/mean_terminated_length": 492.125, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.13724405091311567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.011496823804918677, + "learning_rate": 1.3695852534562213e-05, + "loss": 0.0005, + "num_tokens": 6485814.0, + "reward": 1.2678570747375488, + "reward_std": 0.08321178704500198, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.08321177214384079, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 194.125, + "completions/mean_terminated_length": 194.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.13742851872348275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.02410691638942808, + "learning_rate": 1.3714285714285716e-05, + "loss": 0.001, + "num_tokens": 6490399.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 405.25, + "completions/mean_terminated_length": 405.25, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.13761298653384985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.026261798106133938, + "learning_rate": 1.3732718894009217e-05, + "loss": 0.0011, + "num_tokens": 6502161.0, + "reward": 1.4375, + "reward_std": 0.3922051787376404, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.39220529794692993, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 181.875, + "completions/mean_terminated_length": 181.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.13779745434421695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.02647591312415898, + "learning_rate": 1.375115207373272e-05, + "loss": 0.0011, + "num_tokens": 6506800.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 214.5, + "completions/mean_terminated_length": 214.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.13798192215458402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0380859375, + "kl": 0.013860256178304553, + "learning_rate": 1.3769585253456222e-05, + "loss": 0.0006, + "num_tokens": 6512116.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 395.25, + "completions/mean_terminated_length": 395.25, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.13816638996495112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.01572678005322814, + "learning_rate": 1.3788018433179724e-05, + "loss": 0.0006, + "num_tokens": 6521822.0, + "reward": 1.8333332538604736, + "reward_std": 0.32120805978775024, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.32120802998542786, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.13835085777531822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.03260247875005007, + "learning_rate": 1.3806451612903227e-05, + "loss": 0.0013, + "num_tokens": 6530222.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1165.0, + "completions/mean_length": 973.125, + "completions/mean_terminated_length": 819.5714721679688, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "epoch": 0.1385353255856853, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5, + "kl": 0.010894590872339904, + "learning_rate": 1.382488479262673e-05, + "loss": 0.0004, + "num_tokens": 6548431.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 473.75, + "completions/mean_terminated_length": 473.75, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.1387197933960524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.016499250603374094, + "learning_rate": 1.3843317972350232e-05, + "loss": 0.0007, + "num_tokens": 6562293.0, + "reward": 1.6306817531585693, + "reward_std": 0.2553298771381378, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6306818723678589, + "rewards/fixed_code_pass_all_test_reward/std": 0.25532984733581543, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 411.875, + "completions/mean_terminated_length": 411.875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.13890426120641947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.028235030127689242, + "learning_rate": 1.3861751152073735e-05, + "loss": 0.0011, + "num_tokens": 6572300.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 281.0, + "completions/mean_terminated_length": 281.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.13908872901678657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.03183341957628727, + "learning_rate": 1.3880184331797235e-05, + "loss": 0.0013, + "num_tokens": 6580468.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 457.125, + "completions/mean_terminated_length": 457.125, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.13927319682715367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.01674209046177566, + "learning_rate": 1.3898617511520738e-05, + "loss": 0.0007, + "num_tokens": 6589141.0, + "reward": 1.1875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 174.0, + "completions/mean_terminated_length": 174.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.13945766463752074, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.028881902107968926, + "learning_rate": 1.3917050691244241e-05, + "loss": 0.0012, + "num_tokens": 6593445.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 119.25, + "completions/mean_terminated_length": 119.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.13964213244788784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.03357564425095916, + "learning_rate": 1.3935483870967743e-05, + "loss": 0.0013, + "num_tokens": 6597383.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 472.0, + "completions/mean_terminated_length": 472.0, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.13982660025825494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.021979342098347843, + "learning_rate": 1.3953917050691246e-05, + "loss": 0.0009, + "num_tokens": 6608855.0, + "reward": 1.7644230127334595, + "reward_std": 0.0951874852180481, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7644230127334595, + "rewards/fixed_code_pass_all_test_reward/std": 0.0951874628663063, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.140011068068622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.02389827778097242, + "learning_rate": 1.397235023041475e-05, + "loss": 0.001, + "num_tokens": 6614580.0, + "reward": 1.3974056243896484, + "reward_std": 0.12159383296966553, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3974056839942932, + "rewards/fixed_code_pass_all_test_reward/std": 0.12159384787082672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 80.25, + "completions/mean_terminated_length": 80.25, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1401955358789891, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7578125, + "kl": 0.07828154135495424, + "learning_rate": 1.3990783410138249e-05, + "loss": 0.0031, + "num_tokens": 6618038.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 367.5, + "completions/mean_terminated_length": 367.5, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.14038000368935621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.008564533665776253, + "learning_rate": 1.4009216589861752e-05, + "loss": 0.0003, + "num_tokens": 6625562.0, + "reward": 1.8333332538604736, + "reward_std": 0.34503278136253357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.07715168595314026, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 562.125, + "completions/mean_terminated_length": 562.125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.1405644714997233, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83984375, + "kl": 0.015049646375700831, + "learning_rate": 1.4027649769585254e-05, + "loss": 0.0006, + "num_tokens": 6639067.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 341.5, + "completions/mean_terminated_length": 341.5, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.1407489393100904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2099609375, + "kl": 0.027179012540727854, + "learning_rate": 1.4046082949308757e-05, + "loss": 0.0011, + "num_tokens": 6649743.0, + "reward": 1.0909091234207153, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.09090909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 191.75, + "completions/mean_terminated_length": 191.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.1409334071204575, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.03725534223485738, + "learning_rate": 1.406451612903226e-05, + "loss": 0.0015, + "num_tokens": 6654285.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1090.0, + "completions/max_terminated_length": 1090.0, + "completions/mean_length": 541.0, + "completions/mean_terminated_length": 541.0, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.14111787493082456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.015308193862438202, + "learning_rate": 1.4082949308755762e-05, + "loss": 0.0006, + "num_tokens": 6666349.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 471.5, + "completions/mean_terminated_length": 471.5, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.14130234274119166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.009917626099195331, + "learning_rate": 1.4101382488479263e-05, + "loss": 0.0004, + "num_tokens": 6675841.0, + "reward": 1.6666667461395264, + "reward_std": 0.46004366874694824, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.460043728351593, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 523.875, + "completions/mean_terminated_length": 306.14288330078125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.14148681055155876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48828125, + "kl": 0.009823341250012163, + "learning_rate": 1.4119815668202765e-05, + "loss": 0.0004, + "num_tokens": 6686856.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 290.625, + "completions/mean_terminated_length": 290.625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.14167127836192583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.294921875, + "kl": 0.030536632519215345, + "learning_rate": 1.4138248847926268e-05, + "loss": 0.0012, + "num_tokens": 6693845.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 138.0, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.14185574617229293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.0308961751870811, + "learning_rate": 1.4156682027649771e-05, + "loss": 0.0012, + "num_tokens": 6699701.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 257.0, + "completions/mean_terminated_length": 257.0, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.14204021398266004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.01216026209294796, + "learning_rate": 1.4175115207373273e-05, + "loss": 0.0005, + "num_tokens": 6705565.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 590.375, + "completions/mean_terminated_length": 590.375, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.1422246817930271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.008934514684369788, + "learning_rate": 1.4193548387096776e-05, + "loss": 0.0004, + "num_tokens": 6716352.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 342.5, + "completions/mean_terminated_length": 342.5, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.1424091496033942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.01670481136534363, + "learning_rate": 1.421198156682028e-05, + "loss": 0.0007, + "num_tokens": 6723852.0, + "reward": 1.5833333730697632, + "reward_std": 0.2357023060321808, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.1425936174137613, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.033803693018853664, + "learning_rate": 1.4230414746543779e-05, + "loss": 0.0014, + "num_tokens": 6728408.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 131.625, + "completions/mean_terminated_length": 131.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.14277808522412838, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.059999131597578526, + "learning_rate": 1.4248847926267282e-05, + "loss": 0.0024, + "num_tokens": 6732277.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 374.25, + "completions/mean_terminated_length": 374.25, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.14296255303449548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.01742211007513106, + "learning_rate": 1.4267281105990784e-05, + "loss": 0.0007, + "num_tokens": 6742151.0, + "reward": 1.9402778148651123, + "reward_std": 0.08397896587848663, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9402777552604675, + "rewards/fixed_code_pass_all_test_reward/std": 0.08397898077964783, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 308.25, + "completions/mean_terminated_length": 308.25, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.14314702084486258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.03779052116442472, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.0015, + "num_tokens": 6750569.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.14333148865522966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.05229167826473713, + "learning_rate": 1.430414746543779e-05, + "loss": 0.0021, + "num_tokens": 6758814.0, + "reward": 1.2083332538604736, + "reward_std": 0.28752732276916504, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.2875273525714874, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 335.75, + "completions/mean_terminated_length": 335.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.14351595646559676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.017067412962205708, + "learning_rate": 1.4322580645161292e-05, + "loss": 0.0007, + "num_tokens": 6768372.0, + "reward": 1.777298927307129, + "reward_std": 0.01219146978110075, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7772988080978394, + "rewards/fixed_code_pass_all_test_reward/std": 0.012191482819616795, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 391.5, + "completions/mean_terminated_length": 391.5, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.14370042427596386, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35546875, + "kl": 0.029603343922644854, + "learning_rate": 1.4341013824884793e-05, + "loss": 0.0012, + "num_tokens": 6776480.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 233.5, + "completions/mean_terminated_length": 233.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.14388489208633093, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.027577376225963235, + "learning_rate": 1.4359447004608295e-05, + "loss": 0.0011, + "num_tokens": 6782116.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 278.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.14406935989669803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.045364767545834184, + "learning_rate": 1.4377880184331798e-05, + "loss": 0.0018, + "num_tokens": 6791033.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 277.125, + "completions/mean_terminated_length": 277.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.14425382770706513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.019597986130975187, + "learning_rate": 1.4396313364055301e-05, + "loss": 0.0008, + "num_tokens": 6799026.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 333.5, + "completions/mean_terminated_length": 333.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.1444382955174322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.029411056311801076, + "learning_rate": 1.4414746543778803e-05, + "loss": 0.0012, + "num_tokens": 6807654.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.1446227633277993, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.03787960729096085, + "learning_rate": 1.4433179723502306e-05, + "loss": 0.0015, + "num_tokens": 6815053.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 342.125, + "completions/mean_terminated_length": 342.125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.1448072311381664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.029287808923982084, + "learning_rate": 1.4451612903225806e-05, + "loss": 0.0012, + "num_tokens": 6825862.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.14499169894853348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.02604480367153883, + "learning_rate": 1.447004608294931e-05, + "loss": 0.001, + "num_tokens": 6833291.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 293.875, + "completions/mean_terminated_length": 293.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.14517616675890058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.029392108554020524, + "learning_rate": 1.4488479262672812e-05, + "loss": 0.0012, + "num_tokens": 6839866.0, + "reward": 1.1339285373687744, + "reward_std": 0.8333758115768433, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3839285671710968, + "rewards/fixed_code_pass_all_test_reward/std": 0.510726809501648, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 115.75, + "completions/mean_terminated_length": 115.75, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.14536063456926765, + "frac_reward_zero_std": 1.0, + "grad_norm": 15.5625, + "kl": 0.7103681610897183, + "learning_rate": 1.4506912442396314e-05, + "loss": 0.0284, + "num_tokens": 6843656.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 664.375, + "completions/mean_terminated_length": 664.375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.14554510237963475, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.0, + "kl": 0.01622362481430173, + "learning_rate": 1.4525345622119817e-05, + "loss": 0.0006, + "num_tokens": 6854675.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 346.125, + "completions/mean_terminated_length": 346.125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.14572957019000185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2451171875, + "kl": 0.018583374738227576, + "learning_rate": 1.454377880184332e-05, + "loss": 0.0007, + "num_tokens": 6861820.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 144.75, + "completions/mean_terminated_length": 144.75, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.14591403800036892, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.03317134128883481, + "learning_rate": 1.456221198156682e-05, + "loss": 0.0013, + "num_tokens": 6865850.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 570.125, + "completions/mean_terminated_length": 359.0000305175781, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.14609850581073602, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.015854478522669524, + "learning_rate": 1.4580645161290324e-05, + "loss": 0.0006, + "num_tokens": 6877371.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.14628297362110312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.016353400889784098, + "learning_rate": 1.4599078341013827e-05, + "loss": 0.0007, + "num_tokens": 6884726.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 260.0, + "completions/mean_terminated_length": 260.0, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.1464674414314702, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.014562718919478357, + "learning_rate": 1.4617511520737328e-05, + "loss": 0.0006, + "num_tokens": 6890822.0, + "reward": 1.75, + "reward_std": 0.3505098223686218, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.3505098521709442, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 137.375, + "completions/mean_terminated_length": 137.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1466519092418373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.185546875, + "kl": 0.0419701935024932, + "learning_rate": 1.4635944700460832e-05, + "loss": 0.0017, + "num_tokens": 6894897.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.1468363770522044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.023721805890090764, + "learning_rate": 1.4654377880184335e-05, + "loss": 0.0009, + "num_tokens": 6901224.0, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 317.0, + "completions/mean_terminated_length": 317.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.14702084486257147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.014760594698600471, + "learning_rate": 1.4672811059907835e-05, + "loss": 0.0006, + "num_tokens": 6909312.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 321.75, + "completions/mean_terminated_length": 321.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.14720531267293857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.025285649637226015, + "learning_rate": 1.4691244239631338e-05, + "loss": 0.001, + "num_tokens": 6916214.0, + "reward": 1.4318182468414307, + "reward_std": 0.2780858874320984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4318181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.2780858874320984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 227.375, + "completions/mean_terminated_length": 227.375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.14738978048330567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.01802999951178208, + "learning_rate": 1.470967741935484e-05, + "loss": 0.0007, + "num_tokens": 6920849.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 331.0, + "completions/mean_terminated_length": 331.0, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.14757424829367274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.020955910498742014, + "learning_rate": 1.4728110599078343e-05, + "loss": 0.0008, + "num_tokens": 6929161.0, + "reward": 1.4772727489471436, + "reward_std": 0.2945791482925415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.47727271914482117, + "rewards/fixed_code_pass_all_test_reward/std": 0.2945791482925415, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.14775871610403984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.02193617168813944, + "learning_rate": 1.4746543778801846e-05, + "loss": 0.0009, + "num_tokens": 6933296.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.14794318391440694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.03128149639815092, + "learning_rate": 1.4764976958525347e-05, + "loss": 0.0013, + "num_tokens": 6944145.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 347.0, + "completions/mean_terminated_length": 347.0, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.14812765172477402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.015381735400296748, + "learning_rate": 1.4783410138248849e-05, + "loss": 0.0006, + "num_tokens": 6950793.0, + "reward": 1.759615421295166, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8846153616905212, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 390.5, + "completions/mean_terminated_length": 390.5, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.14831211953514112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.015157002257183194, + "learning_rate": 1.480184331797235e-05, + "loss": 0.0006, + "num_tokens": 6958445.0, + "reward": 1.4285714626312256, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4285714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.14849658734550822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.023545119212940335, + "learning_rate": 1.4820276497695854e-05, + "loss": 0.0009, + "num_tokens": 6966449.0, + "reward": 1.0272727012634277, + "reward_std": 0.04007076844573021, + "rewards/fixed_code_pass_all_test_reward/mean": 0.027272727340459824, + "rewards/fixed_code_pass_all_test_reward/std": 0.040070775896310806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1486810551558753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.01722046302165836, + "learning_rate": 1.4838709677419357e-05, + "loss": 0.0007, + "num_tokens": 6970979.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 448.875, + "completions/mean_terminated_length": 448.875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.1488655229662424, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.03055403829785064, + "learning_rate": 1.4857142857142858e-05, + "loss": 0.0012, + "num_tokens": 6979674.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 376.375, + "completions/mean_terminated_length": 376.375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.1490499907766095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.02564426069147885, + "learning_rate": 1.4875576036866362e-05, + "loss": 0.001, + "num_tokens": 6989245.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 335.875, + "completions/mean_terminated_length": 335.875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.14923445858697656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.02073127235053107, + "learning_rate": 1.4894009216589861e-05, + "loss": 0.0008, + "num_tokens": 6998452.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 222.5, + "completions/mean_terminated_length": 222.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.14941892639734367, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.03303106583189219, + "learning_rate": 1.4912442396313365e-05, + "loss": 0.0013, + "num_tokens": 7005352.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 433.75, + "completions/mean_terminated_length": 433.75, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.14960339420771077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03564453125, + "kl": 0.008066351554589346, + "learning_rate": 1.4930875576036868e-05, + "loss": 0.0003, + "num_tokens": 7013254.0, + "reward": 1.9090909957885742, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.14978786201807784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.011897749034687877, + "learning_rate": 1.494930875576037e-05, + "loss": 0.0005, + "num_tokens": 7018321.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 281.625, + "completions/mean_terminated_length": 281.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.14997232982844494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.50390625, + "kl": 0.037915762164629996, + "learning_rate": 1.4967741935483873e-05, + "loss": 0.0015, + "num_tokens": 7027078.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 487.75, + "completions/mean_terminated_length": 487.75, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.15015679763881204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.014756782446056604, + "learning_rate": 1.4986175115207376e-05, + "loss": 0.0006, + "num_tokens": 7035412.0, + "reward": 1.6195652484893799, + "reward_std": 0.3381814956665039, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7445651888847351, + "rewards/fixed_code_pass_all_test_reward/std": 0.11251069605350494, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.1503412654491791, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.016000698611605912, + "learning_rate": 1.5004608294930876e-05, + "loss": 0.0006, + "num_tokens": 7041478.0, + "reward": 1.5535714626312256, + "reward_std": 0.4855791926383972, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8035714030265808, + "rewards/fixed_code_pass_all_test_reward/std": 0.3657134771347046, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 372.375, + "completions/mean_terminated_length": 372.375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.1505257332595462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.015778279514051974, + "learning_rate": 1.5023041474654379e-05, + "loss": 0.0006, + "num_tokens": 7049233.0, + "reward": 1.144736886024475, + "reward_std": 0.3465586304664612, + "rewards/fixed_code_pass_all_test_reward/mean": 0.14473684132099152, + "rewards/fixed_code_pass_all_test_reward/std": 0.34655866026878357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.1507102010699133, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.021435981965623796, + "learning_rate": 1.504147465437788e-05, + "loss": 0.0009, + "num_tokens": 7058541.0, + "reward": 1.5487804412841797, + "reward_std": 0.4442201256752014, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5487804412841797, + "rewards/fixed_code_pass_all_test_reward/std": 0.4442201256752014, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.15089466888028039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.01879781624302268, + "learning_rate": 1.5059907834101384e-05, + "loss": 0.0008, + "num_tokens": 7064475.0, + "reward": 1.899999976158142, + "reward_std": 0.2828426957130432, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 377.125, + "completions/mean_terminated_length": 377.125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.1510791366906475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.015623708255589008, + "learning_rate": 1.5078341013824887e-05, + "loss": 0.0006, + "num_tokens": 7072236.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 285.0, + "completions/mean_terminated_length": 285.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.15126360450101456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.01770372991450131, + "learning_rate": 1.5096774193548389e-05, + "loss": 0.0007, + "num_tokens": 7080564.0, + "reward": 1.84375, + "reward_std": 0.34092646837234497, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.34092649817466736, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 242.875, + "completions/mean_terminated_length": 242.875, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.15144807231138166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03773631388321519, + "learning_rate": 1.511520737327189e-05, + "loss": 0.0015, + "num_tokens": 7088499.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 599.25, + "completions/mean_terminated_length": 599.25, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.15163254012174876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.011290738882962614, + "learning_rate": 1.5133640552995392e-05, + "loss": 0.0005, + "num_tokens": 7099669.0, + "reward": 1.8020833730697632, + "reward_std": 0.3240906596183777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.15181700793211583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.021938687015790492, + "learning_rate": 1.5152073732718895e-05, + "loss": 0.0009, + "num_tokens": 7106585.0, + "reward": 1.7592592239379883, + "reward_std": 0.24084247648715973, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7592592239379883, + "rewards/fixed_code_pass_all_test_reward/std": 0.24084246158599854, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 520.125, + "completions/mean_terminated_length": 520.125, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.15200147574248293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.0087799089087639, + "learning_rate": 1.5170506912442398e-05, + "loss": 0.0004, + "num_tokens": 7118442.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 230.25, + "completions/mean_terminated_length": 230.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.15218594355285003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.02226733387215063, + "learning_rate": 1.51889400921659e-05, + "loss": 0.0009, + "num_tokens": 7124268.0, + "reward": 1.0860215425491333, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08602150529623032, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 423.75, + "completions/mean_terminated_length": 423.75, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.1523704113632171, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.023682945990003645, + "learning_rate": 1.5207373271889403e-05, + "loss": 0.0009, + "num_tokens": 7136738.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 530.875, + "completions/mean_terminated_length": 530.875, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.1525548791735842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.011575250828173012, + "learning_rate": 1.5225806451612903e-05, + "loss": 0.0005, + "num_tokens": 7150273.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 153.875, + "completions/mean_terminated_length": 153.875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.1527393469839513, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.03281094157136977, + "learning_rate": 1.5244239631336406e-05, + "loss": 0.0013, + "num_tokens": 7154920.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1982.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 538.0, + "completions/mean_terminated_length": 538.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.15292381479431838, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.023295892227906734, + "learning_rate": 1.5262672811059907e-05, + "loss": 0.0009, + "num_tokens": 7166760.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 366.125, + "completions/mean_terminated_length": 366.125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.15310828260468548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.020283473888412118, + "learning_rate": 1.5281105990783412e-05, + "loss": 0.0008, + "num_tokens": 7175665.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 540.625, + "completions/mean_terminated_length": 325.2857360839844, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.15329275041505258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.029170521316700615, + "learning_rate": 1.5299539170506914e-05, + "loss": 0.0012, + "num_tokens": 7187518.0, + "reward": 1.0625, + "reward_std": 0.5629958510398865, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 227.625, + "completions/mean_terminated_length": 227.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.15347721822541965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.022016245580744, + "learning_rate": 1.5317972350230415e-05, + "loss": 0.0009, + "num_tokens": 7195043.0, + "reward": 1.8333333730697632, + "reward_std": 0.30860665440559387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.30860668420791626, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 189.75, + "completions/mean_terminated_length": 189.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.15366168603578675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.05257665412500501, + "learning_rate": 1.533640552995392e-05, + "loss": 0.0021, + "num_tokens": 7203385.0, + "reward": 1.933823585510254, + "reward_std": 0.18717533349990845, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9338235259056091, + "rewards/fixed_code_pass_all_test_reward/std": 0.18717533349990845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 523.5, + "completions/mean_terminated_length": 523.5, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.15384615384615385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.015067261992953718, + "learning_rate": 1.535483870967742e-05, + "loss": 0.0006, + "num_tokens": 7218389.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 192.375, + "completions/mean_terminated_length": 192.375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.15403062165652093, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.01656091643963009, + "learning_rate": 1.5373271889400923e-05, + "loss": 0.0007, + "num_tokens": 7222752.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 342.625, + "completions/mean_terminated_length": 342.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.15421508946688803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.023286007111892104, + "learning_rate": 1.5391705069124425e-05, + "loss": 0.0009, + "num_tokens": 7233213.0, + "reward": 1.3842593431472778, + "reward_std": 0.24879682064056396, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38425928354263306, + "rewards/fixed_code_pass_all_test_reward/std": 0.24879683554172516, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 340.75, + "completions/mean_terminated_length": 340.75, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.15439955727725513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.02792111097369343, + "learning_rate": 1.5410138248847926e-05, + "loss": 0.0011, + "num_tokens": 7257107.0, + "reward": 1.4811747074127197, + "reward_std": 0.3134256899356842, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4811747074127197, + "rewards/fixed_code_pass_all_test_reward/std": 0.3134257197380066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 104.0, + "completions/mean_terminated_length": 104.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.1545840250876222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.024813206517137587, + "learning_rate": 1.542857142857143e-05, + "loss": 0.001, + "num_tokens": 7260819.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 297.375, + "completions/mean_terminated_length": 297.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.1547684928979893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.326171875, + "kl": 0.030742369475774467, + "learning_rate": 1.5447004608294933e-05, + "loss": 0.0012, + "num_tokens": 7269078.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 356.0, + "completions/mean_terminated_length": 356.0, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.1549529607083564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.023281393514480442, + "learning_rate": 1.5465437788018434e-05, + "loss": 0.0009, + "num_tokens": 7279718.0, + "reward": 1.7840908765792847, + "reward_std": 0.15923000872135162, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7840908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.15923000872135162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 264.625, + "completions/mean_terminated_length": 264.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.15513742851872347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.020685943658463657, + "learning_rate": 1.5483870967741936e-05, + "loss": 0.0008, + "num_tokens": 7285747.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.15532189632909058, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.03081200725864619, + "learning_rate": 1.5502304147465438e-05, + "loss": 0.0012, + "num_tokens": 7292810.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 743.5, + "completions/mean_terminated_length": 308.66668701171875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.15550636413945768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.019801245478447527, + "learning_rate": 1.5520737327188942e-05, + "loss": 0.0008, + "num_tokens": 7304574.0, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.15569083194982475, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.03433478041552007, + "learning_rate": 1.5539170506912444e-05, + "loss": 0.0014, + "num_tokens": 7312604.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 197.0, + "completions/mean_terminated_length": 197.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.15587529976019185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.04146273201331496, + "learning_rate": 1.5557603686635946e-05, + "loss": 0.0017, + "num_tokens": 7317076.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.15605976757055895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.014800441393163055, + "learning_rate": 1.5576036866359447e-05, + "loss": 0.0006, + "num_tokens": 7322162.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 163.75, + "completions/mean_terminated_length": 163.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.15624423538092602, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.06402567820623517, + "learning_rate": 1.559447004608295e-05, + "loss": 0.0026, + "num_tokens": 7330336.0, + "reward": 1.8897058963775635, + "reward_std": 0.31195884943008423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, + "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.15642870319129312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.028404308948665857, + "learning_rate": 1.5612903225806454e-05, + "loss": 0.0011, + "num_tokens": 7340053.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.15661317100166022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.029230238869786263, + "learning_rate": 1.5631336405529955e-05, + "loss": 0.0012, + "num_tokens": 7345777.0, + "reward": 1.0446429252624512, + "reward_std": 0.12626908719539642, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0446428582072258, + "rewards/fixed_code_pass_all_test_reward/std": 0.12626907229423523, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 329.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.1567976388120273, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.030965921003371477, + "learning_rate": 1.5649769585253457e-05, + "loss": 0.0012, + "num_tokens": 7357582.0, + "reward": 1.3181817531585693, + "reward_std": 0.12856486439704895, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3181818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.12856487929821014, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.1569821066223944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.014108254690654576, + "learning_rate": 1.566820276497696e-05, + "loss": 0.0006, + "num_tokens": 7364544.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 343.125, + "completions/mean_terminated_length": 343.125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.1571665744327615, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6171875, + "kl": 0.025126139109488577, + "learning_rate": 1.568663594470046e-05, + "loss": 0.001, + "num_tokens": 7371721.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 368.5, + "completions/mean_terminated_length": 368.5, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.15735104224312857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.024267777684144676, + "learning_rate": 1.5705069124423965e-05, + "loss": 0.001, + "num_tokens": 7383245.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 403.5, + "completions/mean_terminated_length": 403.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.15753551005349567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.0269777748035267, + "learning_rate": 1.5723502304147466e-05, + "loss": 0.0011, + "num_tokens": 7392545.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 233.25, + "completions/mean_terminated_length": 233.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.15771997786386274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.02054202405270189, + "learning_rate": 1.5741935483870968e-05, + "loss": 0.0008, + "num_tokens": 7401379.0, + "reward": 1.3658536672592163, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3658536672592163, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 352.375, + "completions/mean_terminated_length": 352.375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.15790444567422984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1767578125, + "kl": 0.027560033951885998, + "learning_rate": 1.5760368663594473e-05, + "loss": 0.0011, + "num_tokens": 7410230.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.15808891348459694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.028376445872709155, + "learning_rate": 1.5778801843317974e-05, + "loss": 0.0011, + "num_tokens": 7416088.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 271.75, + "completions/mean_terminated_length": 271.75, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.15827338129496402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.02747140621067956, + "learning_rate": 1.5797235023041476e-05, + "loss": 0.0011, + "num_tokens": 7425726.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 243.625, + "completions/mean_terminated_length": 243.625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.15845784910533112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.019504684838466346, + "learning_rate": 1.5815668202764977e-05, + "loss": 0.0008, + "num_tokens": 7434059.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 564.25, + "completions/mean_terminated_length": 564.25, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.15864231691569822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.01576250314246863, + "learning_rate": 1.5834101382488482e-05, + "loss": 0.0006, + "num_tokens": 7444317.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.1588267847260653, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.03001510677859187, + "learning_rate": 1.5852534562211984e-05, + "loss": 0.0012, + "num_tokens": 7452921.0, + "reward": 1.471982717514038, + "reward_std": 0.5692220330238342, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7219827771186829, + "rewards/fixed_code_pass_all_test_reward/std": 0.4522782266139984, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 243.75, + "completions/mean_terminated_length": 243.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.1590112525364324, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1904296875, + "kl": 0.0391040020622313, + "learning_rate": 1.5870967741935485e-05, + "loss": 0.0016, + "num_tokens": 7461559.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 206.625, + "completions/mean_terminated_length": 206.625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.1591957203467995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.02142679668031633, + "learning_rate": 1.588940092165899e-05, + "loss": 0.0009, + "num_tokens": 7466756.0, + "reward": 1.96875, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 361.875, + "completions/mean_terminated_length": 361.875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.15938018815716656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.019700519274920225, + "learning_rate": 1.5907834101382488e-05, + "loss": 0.0008, + "num_tokens": 7477195.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 323.5, + "completions/mean_terminated_length": 323.5, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.15956465596753366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.021246141870506108, + "learning_rate": 1.5926267281105993e-05, + "loss": 0.0008, + "num_tokens": 7486887.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 221.25, + "completions/mean_terminated_length": 221.25, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.15974912377790076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.042252736166119576, + "learning_rate": 1.5944700460829495e-05, + "loss": 0.0017, + "num_tokens": 7494641.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 477.25, + "completions/mean_terminated_length": 477.25, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.15993359158826784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.01356708921957761, + "learning_rate": 1.5963133640552996e-05, + "loss": 0.0005, + "num_tokens": 7505811.0, + "reward": 1.138157844543457, + "reward_std": 0.055824220180511475, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1381578892469406, + "rewards/fixed_code_pass_all_test_reward/std": 0.055824216455221176, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 388.625, + "completions/mean_terminated_length": 388.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.16011805939863494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.02838135918136686, + "learning_rate": 1.59815668202765e-05, + "loss": 0.0011, + "num_tokens": 7512560.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 193.125, + "completions/mean_terminated_length": 193.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.16030252720900204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2275390625, + "kl": 0.045360136311501265, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0018, + "num_tokens": 7519905.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 303.625, + "completions/mean_terminated_length": 303.625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.1604869950193691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.06809098657686263, + "learning_rate": 1.6018433179723504e-05, + "loss": 0.0027, + "num_tokens": 7526606.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 330.5, + "completions/mean_terminated_length": 330.5, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1606714628297362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.0310385919874534, + "learning_rate": 1.6036866359447006e-05, + "loss": 0.0012, + "num_tokens": 7533490.0, + "reward": 1.7872340679168701, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7872340679168701, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 433.125, + "completions/mean_terminated_length": 202.42857360839844, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.1608559306401033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.56640625, + "kl": 0.015134699031477794, + "learning_rate": 1.6055299539170507e-05, + "loss": 0.0006, + "num_tokens": 7542339.0, + "reward": 1.6111111640930176, + "reward_std": 0.6542045474052429, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7361111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.30441105365753174, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 391.5, + "completions/mean_terminated_length": 391.5, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.16104039845047038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.017901247541885823, + "learning_rate": 1.6073732718894012e-05, + "loss": 0.0007, + "num_tokens": 7550871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 429.875, + "completions/mean_terminated_length": 429.875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.16122486626083748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.01244936982402578, + "learning_rate": 1.6092165898617514e-05, + "loss": 0.0005, + "num_tokens": 7560318.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.16140933407120459, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.026560049154795706, + "learning_rate": 1.6110599078341015e-05, + "loss": 0.0011, + "num_tokens": 7567487.0, + "reward": 1.0403225421905518, + "reward_std": 0.033390238881111145, + "rewards/fixed_code_pass_all_test_reward/mean": 0.04032257944345474, + "rewards/fixed_code_pass_all_test_reward/std": 0.03339026868343353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 529.125, + "completions/mean_terminated_length": 529.125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.16159380188157166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.015643664402887225, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.0006, + "num_tokens": 7579256.0, + "reward": 1.8660714626312256, + "reward_std": 0.09518492966890335, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8660714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.09518493711948395, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 303.875, + "completions/mean_terminated_length": 303.875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.16177826969193876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.01870645908638835, + "learning_rate": 1.614746543778802e-05, + "loss": 0.0007, + "num_tokens": 7585903.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 301.375, + "completions/mean_terminated_length": 301.375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.16196273750230586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.04789177072234452, + "learning_rate": 1.6165898617511523e-05, + "loss": 0.0019, + "num_tokens": 7596090.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.16214720531267293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.023532550549134612, + "learning_rate": 1.6184331797235025e-05, + "loss": 0.0009, + "num_tokens": 7603086.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 282.375, + "completions/mean_terminated_length": 282.375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.16233167312304003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.265625, + "kl": 0.05211366293951869, + "learning_rate": 1.6202764976958526e-05, + "loss": 0.0021, + "num_tokens": 7612193.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.16251614093340713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.04060445830691606, + "learning_rate": 1.622119815668203e-05, + "loss": 0.0016, + "num_tokens": 7619060.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 289.625, + "completions/mean_terminated_length": 289.625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.1627006087437742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.03118653572164476, + "learning_rate": 1.623963133640553e-05, + "loss": 0.0012, + "num_tokens": 7625497.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 301.625, + "completions/mean_terminated_length": 301.625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.1628850765541413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.007208866882137954, + "learning_rate": 1.6258064516129034e-05, + "loss": 0.0003, + "num_tokens": 7631334.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 118.25, + "completions/mean_terminated_length": 118.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.1630695443645084, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.078125, + "kl": 0.025451550958678126, + "learning_rate": 1.6276497695852536e-05, + "loss": 0.001, + "num_tokens": 7635096.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 447.125, + "completions/mean_terminated_length": 218.42857360839844, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.16325401217487548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.02505898952949792, + "learning_rate": 1.6294930875576037e-05, + "loss": 0.001, + "num_tokens": 7643441.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 307.625, + "completions/mean_terminated_length": 307.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.16343847998524258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.019641457591205835, + "learning_rate": 1.6313364055299542e-05, + "loss": 0.0008, + "num_tokens": 7650422.0, + "reward": 1.51630437374115, + "reward_std": 0.4275406301021576, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5163043141365051, + "rewards/fixed_code_pass_all_test_reward/std": 0.4275406002998352, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 380.25, + "completions/mean_terminated_length": 380.25, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.16362294779560965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.01911249232944101, + "learning_rate": 1.6331797235023044e-05, + "loss": 0.0008, + "num_tokens": 7661544.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 368.625, + "completions/mean_terminated_length": 368.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.16380741560597675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.026515733799897134, + "learning_rate": 1.6350230414746545e-05, + "loss": 0.0011, + "num_tokens": 7672989.0, + "reward": 1.7777776718139648, + "reward_std": 0.31426966190338135, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.16399188341634385, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.02514504559803754, + "learning_rate": 1.6368663594470047e-05, + "loss": 0.001, + "num_tokens": 7682652.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.16417635122671093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.015608435205649585, + "learning_rate": 1.638709677419355e-05, + "loss": 0.0006, + "num_tokens": 7693236.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.16436081903707803, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.017180391238071024, + "learning_rate": 1.6405529953917053e-05, + "loss": 0.0007, + "num_tokens": 7705065.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 498.125, + "completions/mean_terminated_length": 498.125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.16454528684744513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.0068995543697383255, + "learning_rate": 1.6423963133640555e-05, + "loss": 0.0003, + "num_tokens": 7714090.0, + "reward": 1.971153736114502, + "reward_std": 0.03981149569153786, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.039811473339796066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 226.0, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.1647297546578122, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.022971801343373954, + "learning_rate": 1.6442396313364056e-05, + "loss": 0.0009, + "num_tokens": 7721530.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 322.125, + "completions/mean_terminated_length": 322.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.1649142224681793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.01748776837484911, + "learning_rate": 1.6460829493087558e-05, + "loss": 0.0007, + "num_tokens": 7728403.0, + "reward": 1.3499999046325684, + "reward_std": 0.1414213478565216, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 421.75, + "completions/mean_terminated_length": 421.75, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.1650986902785464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.021252246340736747, + "learning_rate": 1.647926267281106e-05, + "loss": 0.0009, + "num_tokens": 7736593.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 605.375, + "completions/mean_terminated_length": 605.375, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "epoch": 0.16528315808891347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.010175477509619668, + "learning_rate": 1.6497695852534564e-05, + "loss": 0.0004, + "num_tokens": 7752908.0, + "reward": 1.59375, + "reward_std": 0.1293872892856598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.16546762589928057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.03583967941813171, + "learning_rate": 1.6516129032258066e-05, + "loss": 0.0014, + "num_tokens": 7760897.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 417.375, + "completions/mean_terminated_length": 417.375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.16565209370964767, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.014887030585668981, + "learning_rate": 1.6534562211981567e-05, + "loss": 0.0006, + "num_tokens": 7769916.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 440.5, + "completions/mean_terminated_length": 440.5, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.16583656152001475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.02132390032056719, + "learning_rate": 1.6552995391705072e-05, + "loss": 0.0009, + "num_tokens": 7779472.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 645.25, + "completions/mean_terminated_length": 645.25, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "epoch": 0.16602102933038185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.012361459608655423, + "learning_rate": 1.6571428571428574e-05, + "loss": 0.0005, + "num_tokens": 7795722.0, + "reward": 1.1141974925994873, + "reward_std": 0.09465660154819489, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11419752240180969, + "rewards/fixed_code_pass_all_test_reward/std": 0.09465659409761429, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.16620549714074895, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.015072114358190447, + "learning_rate": 1.6589861751152075e-05, + "loss": 0.0006, + "num_tokens": 7801112.0, + "reward": 1.984375, + "reward_std": 0.04419417306780815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.16638996495111602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.018616642453707755, + "learning_rate": 1.6608294930875577e-05, + "loss": 0.0007, + "num_tokens": 7805650.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 288.5, + "completions/mean_terminated_length": 288.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.16657443276148312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.0367014838848263, + "learning_rate": 1.662672811059908e-05, + "loss": 0.0015, + "num_tokens": 7811910.0, + "reward": 1.78125, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 294.75, + "completions/mean_terminated_length": 294.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.16675890057185022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.040688263485208154, + "learning_rate": 1.6645161290322583e-05, + "loss": 0.0016, + "num_tokens": 7821812.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 449.875, + "completions/mean_terminated_length": 449.875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.1669433683822173, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.01776444015558809, + "learning_rate": 1.6663594470046085e-05, + "loss": 0.0007, + "num_tokens": 7831315.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 285.625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.1671278361925844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.021341740270145237, + "learning_rate": 1.6682027649769587e-05, + "loss": 0.0009, + "num_tokens": 7838448.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 215.0, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.1673123040029515, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.043569871108047664, + "learning_rate": 1.6700460829493088e-05, + "loss": 0.0017, + "num_tokens": 7846448.0, + "reward": 1.7364864349365234, + "reward_std": 0.36368322372436523, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7364864349365234, + "rewards/fixed_code_pass_all_test_reward/std": 0.36368322372436523, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 519.125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.16749677181331857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.009683635624242015, + "learning_rate": 1.671889400921659e-05, + "loss": 0.0004, + "num_tokens": 7855945.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 473.5, + "completions/mean_terminated_length": 248.57144165039062, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.16768123962368567, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.023402717532007955, + "learning_rate": 1.6737327188940095e-05, + "loss": 0.0009, + "num_tokens": 7864005.0, + "reward": 0.84375, + "reward_std": 0.5240969061851501, + "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, + "rewards/fixed_code_pass_all_test_reward/std": 0.08258593827486038, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 164.5, + "completions/mean_terminated_length": 164.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.16786570743405277, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.03412551339715719, + "learning_rate": 1.6755760368663596e-05, + "loss": 0.0014, + "num_tokens": 7868161.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 255.375, + "completions/mean_terminated_length": 255.375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.16805017524441984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.04721176717430353, + "learning_rate": 1.6774193548387098e-05, + "loss": 0.0019, + "num_tokens": 7875724.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.16823464305478694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.017231334350071847, + "learning_rate": 1.6792626728110603e-05, + "loss": 0.0007, + "num_tokens": 7884198.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.16841911086515404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.04926277091726661, + "learning_rate": 1.68110599078341e-05, + "loss": 0.002, + "num_tokens": 7892316.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 235.125, + "completions/mean_terminated_length": 235.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.16860357867552112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.0403216271661222, + "learning_rate": 1.6829493087557606e-05, + "loss": 0.0016, + "num_tokens": 7901029.0, + "reward": 1.7999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 336.0, + "completions/mean_terminated_length": 336.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.16878804648588822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.025000788504257798, + "learning_rate": 1.6847926267281107e-05, + "loss": 0.001, + "num_tokens": 7908637.0, + "reward": 1.4700000286102295, + "reward_std": 0.3870769441127777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4700000286102295, + "rewards/fixed_code_pass_all_test_reward/std": 0.3870770037174225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 289.875, + "completions/mean_terminated_length": 289.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.16897251429625532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.016198983183130622, + "learning_rate": 1.686635944700461e-05, + "loss": 0.0006, + "num_tokens": 7913836.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.1691569821066224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.03361594758462161, + "learning_rate": 1.6884792626728114e-05, + "loss": 0.0013, + "num_tokens": 7923414.0, + "reward": 1.3068182468414307, + "reward_std": 0.3070886433124542, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.3070886433124542, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 229.625, + "completions/mean_terminated_length": 229.625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.1693414499169895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.027360130450688303, + "learning_rate": 1.6903225806451615e-05, + "loss": 0.0011, + "num_tokens": 7930899.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.1695259177273566, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.013544372864998877, + "learning_rate": 1.6921658986175117e-05, + "loss": 0.0005, + "num_tokens": 7936215.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1493.0, + "completions/max_terminated_length": 1493.0, + "completions/mean_length": 960.75, + "completions/mean_terminated_length": 960.75, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "epoch": 0.16971038553772366, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42578125, + "kl": 0.008179209427908063, + "learning_rate": 1.6940092165898618e-05, + "loss": 0.0003, + "num_tokens": 7956581.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 183.25, + "completions/mean_terminated_length": 183.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.16989485334809076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.04175469046458602, + "learning_rate": 1.695852534562212e-05, + "loss": 0.0017, + "num_tokens": 7965207.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 324.375, + "completions/mean_terminated_length": 324.375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.17007932115845784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.02670258254511282, + "learning_rate": 1.6976958525345625e-05, + "loss": 0.0011, + "num_tokens": 7974786.0, + "reward": 1.7025001049041748, + "reward_std": 0.3619687855243683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7024999856948853, + "rewards/fixed_code_pass_all_test_reward/std": 0.36196884512901306, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 227.5, + "completions/mean_terminated_length": 227.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.17026378896882494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.017540586995892227, + "learning_rate": 1.6995391705069126e-05, + "loss": 0.0007, + "num_tokens": 7979670.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 329.625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.17044825677919204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.021959692589007318, + "learning_rate": 1.7013824884792628e-05, + "loss": 0.0009, + "num_tokens": 7985315.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 363.5, + "completions/mean_terminated_length": 363.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.1706327245895591, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.0193521564360708, + "learning_rate": 1.703225806451613e-05, + "loss": 0.0008, + "num_tokens": 7993415.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 420.75, + "completions/mean_terminated_length": 420.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.1708171923999262, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.03135782788740471, + "learning_rate": 1.705069124423963e-05, + "loss": 0.0013, + "num_tokens": 8000829.0, + "reward": 1.6378676891326904, + "reward_std": 0.33580029010772705, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7628676891326904, + "rewards/fixed_code_pass_all_test_reward/std": 0.1857566386461258, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 289.25, + "completions/mean_terminated_length": 289.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.1710016602102933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.02334133314434439, + "learning_rate": 1.7069124423963136e-05, + "loss": 0.0009, + "num_tokens": 8009727.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 316.375, + "completions/mean_terminated_length": 316.375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.17118612802066038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.03564389212988317, + "learning_rate": 1.7087557603686637e-05, + "loss": 0.0014, + "num_tokens": 8019762.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 426.875, + "completions/mean_terminated_length": 426.875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.17137059583102748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.03187166718998924, + "learning_rate": 1.710599078341014e-05, + "loss": 0.0013, + "num_tokens": 8027985.0, + "reward": 1.25, + "reward_std": 0.13363061845302582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 365.875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.17155506364139458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.023763384553603828, + "learning_rate": 1.7124423963133644e-05, + "loss": 0.001, + "num_tokens": 8038664.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 318.125, + "completions/mean_terminated_length": 318.125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.17173953145176166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.02721288026077673, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.0011, + "num_tokens": 8046673.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 421.5, + "completions/mean_terminated_length": 189.1428680419922, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.17192399926212876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.018199846977950074, + "learning_rate": 1.7161290322580647e-05, + "loss": 0.0007, + "num_tokens": 8052885.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 363.0, + "completions/mean_terminated_length": 363.0, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.17210846707249586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.016387610288802534, + "learning_rate": 1.7179723502304148e-05, + "loss": 0.0007, + "num_tokens": 8059877.0, + "reward": 1.404411792755127, + "reward_std": 0.10218808799982071, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40441176295280457, + "rewards/fixed_code_pass_all_test_reward/std": 0.1021881252527237, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 514.5, + "completions/mean_terminated_length": 514.5, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.17229293488286293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.0280854522716254, + "learning_rate": 1.719815668202765e-05, + "loss": 0.0011, + "num_tokens": 8073113.0, + "reward": 1.6428570747375488, + "reward_std": 0.4948716461658478, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.49487167596817017, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 345.625, + "completions/mean_terminated_length": 345.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.17247740269323003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.02991078153718263, + "learning_rate": 1.7216589861751155e-05, + "loss": 0.0012, + "num_tokens": 8081894.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 313.375, + "completions/mean_terminated_length": 313.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.17266187050359713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2001953125, + "kl": 0.045928434701636434, + "learning_rate": 1.7235023041474656e-05, + "loss": 0.0018, + "num_tokens": 8091401.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 329.875, + "completions/mean_terminated_length": 329.875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.1728463383139642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.03130838752258569, + "learning_rate": 1.7253456221198158e-05, + "loss": 0.0013, + "num_tokens": 8101128.0, + "reward": 1.2804054021835327, + "reward_std": 0.05393325537443161, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2804054021835327, + "rewards/fixed_code_pass_all_test_reward/std": 0.05393326282501221, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 800.75, + "completions/mean_terminated_length": 800.75, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "epoch": 0.1730308061243313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.008159483026247472, + "learning_rate": 1.727188940092166e-05, + "loss": 0.0003, + "num_tokens": 8116486.0, + "reward": 1.5119047164916992, + "reward_std": 0.23363162577152252, + "rewards/fixed_code_pass_all_test_reward/mean": 0.511904776096344, + "rewards/fixed_code_pass_all_test_reward/std": 0.23363162577152252, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 378.75, + "completions/mean_terminated_length": 378.75, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.1732152739346984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.01692582934629172, + "learning_rate": 1.729032258064516e-05, + "loss": 0.0007, + "num_tokens": 8126068.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 499.875, + "completions/mean_terminated_length": 499.875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.17339974174506548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.02368154004216194, + "learning_rate": 1.7308755760368666e-05, + "loss": 0.0009, + "num_tokens": 8137859.0, + "reward": 1.5297619104385376, + "reward_std": 0.7361450791358948, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6547619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.47941088676452637, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 221.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.17358420955543258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.029777121846564114, + "learning_rate": 1.7327188940092167e-05, + "loss": 0.0012, + "num_tokens": 8142756.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 551.5, + "completions/mean_terminated_length": 551.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.17376867736579968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.020690912671852857, + "learning_rate": 1.734562211981567e-05, + "loss": 0.0008, + "num_tokens": 8154976.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 357.625, + "completions/mean_terminated_length": 357.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.17395314517616675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.028870258829556406, + "learning_rate": 1.736405529953917e-05, + "loss": 0.0012, + "num_tokens": 8162413.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.17413761298653385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.029334601247683167, + "learning_rate": 1.7382488479262672e-05, + "loss": 0.0012, + "num_tokens": 8172324.0, + "reward": 1.68478262424469, + "reward_std": 0.31759119033813477, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6847826242446899, + "rewards/fixed_code_pass_all_test_reward/std": 0.31759122014045715, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 405.0, + "completions/mean_terminated_length": 405.0, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.17432208079690095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.01651646476238966, + "learning_rate": 1.7400921658986177e-05, + "loss": 0.0007, + "num_tokens": 8181116.0, + "reward": 1.0099999904632568, + "reward_std": 0.028284268453717232, + "rewards/fixed_code_pass_all_test_reward/mean": 0.009999999776482582, + "rewards/fixed_code_pass_all_test_reward/std": 0.02828427031636238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 652.625, + "completions/mean_terminated_length": 652.625, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "epoch": 0.17450654860726802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.53515625, + "kl": 0.0072012043674476445, + "learning_rate": 1.741935483870968e-05, + "loss": 0.0003, + "num_tokens": 8190713.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 188.0, + "completions/mean_terminated_length": 188.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.17469101641763513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.02335134509485215, + "learning_rate": 1.743778801843318e-05, + "loss": 0.0009, + "num_tokens": 8195569.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 428.5, + "completions/mean_terminated_length": 428.5, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.17487548422800223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.01473669579718262, + "learning_rate": 1.7456221198156685e-05, + "loss": 0.0006, + "num_tokens": 8207597.0, + "reward": 1.451388955116272, + "reward_std": 0.5865839123725891, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5763888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.23323412239551544, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 186.125, + "completions/mean_terminated_length": 186.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1750599520383693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.03596255858428776, + "learning_rate": 1.7474654377880186e-05, + "loss": 0.0014, + "num_tokens": 8212958.0, + "reward": 1.3392858505249023, + "reward_std": 0.32448652386665344, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3392857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.3244866132736206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.1752444198487364, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.02906238113064319, + "learning_rate": 1.7493087557603688e-05, + "loss": 0.0012, + "num_tokens": 8219676.0, + "reward": 1.5185184478759766, + "reward_std": 0.40668532252311707, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5185185670852661, + "rewards/fixed_code_pass_all_test_reward/std": 0.4066852927207947, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 275.375, + "completions/mean_terminated_length": 275.375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.1754288876591035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.03594231209717691, + "learning_rate": 1.751152073732719e-05, + "loss": 0.0014, + "num_tokens": 8226247.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 237.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.17561335546947057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0609193128766492, + "learning_rate": 1.7529953917050694e-05, + "loss": 0.0024, + "num_tokens": 8235454.0, + "reward": 1.7635540962219238, + "reward_std": 0.046856433153152466, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7635542154312134, + "rewards/fixed_code_pass_all_test_reward/std": 0.046856485307216644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 212.0, + "completions/mean_terminated_length": 212.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.17579782327983767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.045941002666950226, + "learning_rate": 1.7548387096774196e-05, + "loss": 0.0018, + "num_tokens": 8241254.0, + "reward": 1.642045497894287, + "reward_std": 0.20535555481910706, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6420454978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.20535553991794586, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 128.625, + "completions/mean_terminated_length": 128.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.17598229109020475, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.234375, + "kl": 0.08620562660507858, + "learning_rate": 1.7566820276497697e-05, + "loss": 0.0034, + "num_tokens": 8244915.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.17616675890057185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.022723793517798185, + "learning_rate": 1.75852534562212e-05, + "loss": 0.0009, + "num_tokens": 8254203.0, + "reward": 1.7272727489471436, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 167.75, + "completions/mean_terminated_length": 167.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.17635122671093895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.020114279235713184, + "learning_rate": 1.76036866359447e-05, + "loss": 0.0008, + "num_tokens": 8258561.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 133.875, + "completions/mean_terminated_length": 133.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.17653569452130602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.050007963087409735, + "learning_rate": 1.7622119815668205e-05, + "loss": 0.002, + "num_tokens": 8265960.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 244.875, + "completions/mean_terminated_length": 244.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.17672016233167312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.049208341632038355, + "learning_rate": 1.7640552995391707e-05, + "loss": 0.002, + "num_tokens": 8273975.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.17690463014204022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.017374115181155503, + "learning_rate": 1.765898617511521e-05, + "loss": 0.0007, + "num_tokens": 8281676.0, + "reward": 1.390625, + "reward_std": 0.2471940815448761, + "rewards/fixed_code_pass_all_test_reward/mean": 0.390625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2471940815448761, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 253.75, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.1770890979524073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.02180948620662093, + "learning_rate": 1.7677419354838713e-05, + "loss": 0.0009, + "num_tokens": 8286834.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 411.375, + "completions/mean_terminated_length": 411.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.1772735657627744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.017014401382766664, + "learning_rate": 1.7695852534562215e-05, + "loss": 0.0007, + "num_tokens": 8296205.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.1774580335731415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.03613136557396501, + "learning_rate": 1.7714285714285717e-05, + "loss": 0.0014, + "num_tokens": 8300599.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 227.625, + "completions/mean_terminated_length": 227.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.17764250138350857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.0538258976303041, + "learning_rate": 1.7732718894009218e-05, + "loss": 0.0022, + "num_tokens": 8309220.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 234.625, + "completions/mean_terminated_length": 234.625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.17782696919387567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.03132734901737422, + "learning_rate": 1.775115207373272e-05, + "loss": 0.0013, + "num_tokens": 8317369.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 361.75, + "completions/mean_terminated_length": 361.75, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.17801143700424277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.033077884931117296, + "learning_rate": 1.7769585253456225e-05, + "loss": 0.0013, + "num_tokens": 8324839.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 162.75, + "completions/mean_terminated_length": 162.75, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.17819590481460984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.045198181411251426, + "learning_rate": 1.7788018433179726e-05, + "loss": 0.0018, + "num_tokens": 8330933.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 274.5, + "completions/mean_terminated_length": 274.5, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.17838037262497694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.03309439099393785, + "learning_rate": 1.7806451612903228e-05, + "loss": 0.0013, + "num_tokens": 8337457.0, + "reward": 1.2635869979858398, + "reward_std": 0.3847690224647522, + "rewards/fixed_code_pass_all_test_reward/mean": 0.26358693838119507, + "rewards/fixed_code_pass_all_test_reward/std": 0.3847690224647522, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 665.625, + "completions/mean_terminated_length": 665.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.17856484043534404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.029766413033939898, + "learning_rate": 1.782488479262673e-05, + "loss": 0.0012, + "num_tokens": 8353790.0, + "reward": 1.2000000476837158, + "reward_std": 0.35456210374832153, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.3545621335506439, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.1787493082457111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.016769222274888307, + "learning_rate": 1.784331797235023e-05, + "loss": 0.0007, + "num_tokens": 8360333.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 483.75, + "completions/mean_terminated_length": 483.75, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.17893377605607821, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.02423131396062672, + "learning_rate": 1.7861751152073736e-05, + "loss": 0.001, + "num_tokens": 8370371.0, + "reward": 1.1071428060531616, + "reward_std": 0.14787116646766663, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 305.375, + "completions/mean_terminated_length": 305.375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.17911824386644531, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.031280478346161544, + "learning_rate": 1.7880184331797237e-05, + "loss": 0.0013, + "num_tokens": 8377446.0, + "reward": 1.3671875, + "reward_std": 0.39058035612106323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3671875, + "rewards/fixed_code_pass_all_test_reward/std": 0.39058035612106323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 595.625, + "completions/mean_terminated_length": 595.625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.1793027116768124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58984375, + "kl": 0.010642128705512732, + "learning_rate": 1.789861751152074e-05, + "loss": 0.0004, + "num_tokens": 8391331.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 258.75, + "completions/mean_terminated_length": 258.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.1794871794871795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19921875, + "kl": 0.031383720925077796, + "learning_rate": 1.7917050691244244e-05, + "loss": 0.0013, + "num_tokens": 8396233.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 277.625, + "completions/mean_terminated_length": 277.625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.1796716472975466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.028886747430078685, + "learning_rate": 1.7935483870967742e-05, + "loss": 0.0012, + "num_tokens": 8402630.0, + "reward": 1.2347561120986938, + "reward_std": 0.10093256831169128, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23475609719753265, + "rewards/fixed_code_pass_all_test_reward/std": 0.10093259811401367, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 204.875, + "completions/mean_terminated_length": 204.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.17985611510791366, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.04797433433122933, + "learning_rate": 1.7953917050691247e-05, + "loss": 0.0019, + "num_tokens": 8408325.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.18004058291828076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03495860635302961, + "learning_rate": 1.7972350230414748e-05, + "loss": 0.0014, + "num_tokens": 8414248.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.18022505072864786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.020019241026602685, + "learning_rate": 1.799078341013825e-05, + "loss": 0.0008, + "num_tokens": 8423828.0, + "reward": 1.5803570747375488, + "reward_std": 0.09687156975269318, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5803571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.09687161445617676, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 360.25, + "completions/mean_terminated_length": 360.25, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.18040951853901493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.013790995057206601, + "learning_rate": 1.8009216589861755e-05, + "loss": 0.0006, + "num_tokens": 8430670.0, + "reward": 1.375, + "reward_std": 0.3857583999633789, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3857583999633789, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 334.75, + "completions/mean_terminated_length": 334.75, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.18059398634938204, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.02370634861290455, + "learning_rate": 1.8027649769585256e-05, + "loss": 0.0009, + "num_tokens": 8437844.0, + "reward": 1.5892856121063232, + "reward_std": 0.1415758579969406, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.1415759027004242, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.18077845415974914, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.023717242409475148, + "learning_rate": 1.8046082949308758e-05, + "loss": 0.0009, + "num_tokens": 8441916.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.1809629219701162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.038024751702323556, + "learning_rate": 1.806451612903226e-05, + "loss": 0.0015, + "num_tokens": 8448047.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 258.0, + "completions/mean_terminated_length": 258.0, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.1811473897804833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.052003409480676055, + "learning_rate": 1.808294930875576e-05, + "loss": 0.0021, + "num_tokens": 8455935.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 230.0, + "completions/mean_terminated_length": 230.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.1813318575908504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.048348663724027574, + "learning_rate": 1.8101382488479266e-05, + "loss": 0.0019, + "num_tokens": 8462783.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 228.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.18151632540121748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.039190820360090584, + "learning_rate": 1.8119815668202767e-05, + "loss": 0.0016, + "num_tokens": 8467469.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 327.0, + "completions/mean_terminated_length": 327.0, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.18170079321158458, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.375, + "kl": 0.14731870009563863, + "learning_rate": 1.813824884792627e-05, + "loss": 0.0059, + "num_tokens": 8476341.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 409.125, + "completions/mean_terminated_length": 409.125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.18188526102195168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.03180694949696772, + "learning_rate": 1.815668202764977e-05, + "loss": 0.0013, + "num_tokens": 8488214.0, + "reward": 1.1315789222717285, + "reward_std": 0.05626552551984787, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1315789520740509, + "rewards/fixed_code_pass_all_test_reward/std": 0.05626552179455757, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.18206972883231876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.02653311169706285, + "learning_rate": 1.8175115207373272e-05, + "loss": 0.0011, + "num_tokens": 8495040.0, + "reward": 1.942307710647583, + "reward_std": 0.061675652861595154, + "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.06167568638920784, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 423.25, + "completions/mean_terminated_length": 423.25, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.18225419664268586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90234375, + "kl": 0.014761926722712815, + "learning_rate": 1.8193548387096777e-05, + "loss": 0.0006, + "num_tokens": 8503658.0, + "reward": 1.7797619104385376, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 247.0, + "completions/mean_terminated_length": 247.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.18243866445305293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.03590187546797097, + "learning_rate": 1.8211981566820278e-05, + "loss": 0.0014, + "num_tokens": 8512146.0, + "reward": 1.0833332538604736, + "reward_std": 0.10212231427431107, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333283662796, + "rewards/fixed_code_pass_all_test_reward/std": 0.10212230682373047, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.18262313226342003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.01883467345032841, + "learning_rate": 1.823041474654378e-05, + "loss": 0.0008, + "num_tokens": 8516755.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.18280760007378713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.04174447455443442, + "learning_rate": 1.8248847926267285e-05, + "loss": 0.0017, + "num_tokens": 8525993.0, + "reward": 0.8125, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 328.375, + "completions/mean_terminated_length": 328.375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.1829920678841542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.03661162592470646, + "learning_rate": 1.8267281105990783e-05, + "loss": 0.0015, + "num_tokens": 8531852.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 888.375, + "completions/mean_terminated_length": 888.375, + "completions/min_length": 764.0, + "completions/min_terminated_length": 764.0, + "epoch": 0.1831765356945213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.54296875, + "kl": 0.011283142201136798, + "learning_rate": 1.8285714285714288e-05, + "loss": 0.0005, + "num_tokens": 8547615.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 464.125, + "completions/mean_terminated_length": 464.125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.1833610035048884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.04849442606791854, + "learning_rate": 1.830414746543779e-05, + "loss": 0.0019, + "num_tokens": 8556472.0, + "reward": 0.6607142686843872, + "reward_std": 0.5555838942527771, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0357142873108387, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 366.875, + "completions/mean_terminated_length": 366.875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.18354547131525548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.04402253753505647, + "learning_rate": 1.832258064516129e-05, + "loss": 0.0018, + "num_tokens": 8566039.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 455.625, + "completions/mean_terminated_length": 455.625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.18372993912562258, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.008188151550712064, + "learning_rate": 1.8341013824884796e-05, + "loss": 0.0003, + "num_tokens": 8579276.0, + "reward": 1.8099173307418823, + "reward_std": 0.20320695638656616, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8099173307418823, + "rewards/fixed_code_pass_all_test_reward/std": 0.2032068967819214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 634.375, + "completions/mean_terminated_length": 634.375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.18391440693598968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.03221911733271554, + "learning_rate": 1.8359447004608297e-05, + "loss": 0.0013, + "num_tokens": 8594399.0, + "reward": 1.6534091234207153, + "reward_std": 0.6409562826156616, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7784091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.39285045862197876, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 331.875, + "completions/mean_terminated_length": 331.875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.18409887474635675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.60546875, + "kl": 0.07009333209134638, + "learning_rate": 1.83778801843318e-05, + "loss": 0.0028, + "num_tokens": 8601622.0, + "reward": 1.2222222089767456, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1127.0, + "completions/mean_length": 1154.625, + "completions/mean_terminated_length": 1027.0, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "epoch": 0.18428334255672385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.478515625, + "kl": 0.00859526326530613, + "learning_rate": 1.83963133640553e-05, + "loss": 0.0003, + "num_tokens": 8620883.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 475.875, + "completions/mean_terminated_length": 475.875, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.18446781036709095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.03287659399211407, + "learning_rate": 1.8414746543778802e-05, + "loss": 0.0013, + "num_tokens": 8630058.0, + "reward": 1.807692289352417, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 260.875, + "completions/mean_terminated_length": 260.875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.18465227817745802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.01720403239596635, + "learning_rate": 1.8433179723502307e-05, + "loss": 0.0007, + "num_tokens": 8638497.0, + "reward": 1.8790322542190552, + "reward_std": 0.34214845299720764, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8790322542190552, + "rewards/fixed_code_pass_all_test_reward/std": 0.34214845299720764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.18483674598782512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.03713111043907702, + "learning_rate": 1.845161290322581e-05, + "loss": 0.0015, + "num_tokens": 8646487.0, + "reward": 1.9147727489471436, + "reward_std": 0.20535552501678467, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.20535555481910706, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 257.375, + "completions/mean_terminated_length": 257.375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.18502121379819222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.05610498250462115, + "learning_rate": 1.847004608294931e-05, + "loss": 0.0022, + "num_tokens": 8656250.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.1852056816085593, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.03201098367571831, + "learning_rate": 1.848847926267281e-05, + "loss": 0.0013, + "num_tokens": 8664434.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.1853901494189264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.02570451988140121, + "learning_rate": 1.8506912442396313e-05, + "loss": 0.001, + "num_tokens": 8670856.0, + "reward": 1.6881721019744873, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 302.75, + "completions/mean_terminated_length": 302.75, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.1855746172292935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.029941468965262175, + "learning_rate": 1.8525345622119818e-05, + "loss": 0.0012, + "num_tokens": 8676534.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 405.875, + "completions/mean_terminated_length": 405.875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.18575908503966057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.026903101592324674, + "learning_rate": 1.854377880184332e-05, + "loss": 0.0011, + "num_tokens": 8688949.0, + "reward": 1.966292142868042, + "reward_std": 0.06241482123732567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.966292142868042, + "rewards/fixed_code_pass_all_test_reward/std": 0.062414851039648056, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 554.875, + "completions/mean_terminated_length": 554.875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.18594355285002767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.02123036654666066, + "learning_rate": 1.856221198156682e-05, + "loss": 0.0008, + "num_tokens": 8708660.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.18612802066039477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.029438258847221732, + "learning_rate": 1.8580645161290326e-05, + "loss": 0.0012, + "num_tokens": 8720059.0, + "reward": 1.402438998222351, + "reward_std": 0.20322878658771515, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40243905782699585, + "rewards/fixed_code_pass_all_test_reward/std": 0.20322881639003754, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 228.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.18631248847076184, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.033820669865235686, + "learning_rate": 1.8599078341013824e-05, + "loss": 0.0014, + "num_tokens": 8724705.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.18649695628112894, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.05939379730261862, + "learning_rate": 1.861751152073733e-05, + "loss": 0.0024, + "num_tokens": 8733606.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.18668142409149605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.038287939270958304, + "learning_rate": 1.863594470046083e-05, + "loss": 0.0015, + "num_tokens": 8741323.0, + "reward": 1.6736111640930176, + "reward_std": 0.3493979275226593, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6736111640930176, + "rewards/fixed_code_pass_all_test_reward/std": 0.3493979573249817, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.18686589190186312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6171875, + "kl": 0.029156034695915878, + "learning_rate": 1.8654377880184332e-05, + "loss": 0.0012, + "num_tokens": 8751515.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.18705035971223022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05682164872996509, + "learning_rate": 1.8672811059907837e-05, + "loss": 0.0023, + "num_tokens": 8759281.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 523.625, + "completions/mean_terminated_length": 523.625, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.18723482752259732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.05350975785404444, + "learning_rate": 1.869124423963134e-05, + "loss": 0.0021, + "num_tokens": 8769398.0, + "reward": 1.3318965435028076, + "reward_std": 0.13410648703575134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, + "rewards/fixed_code_pass_all_test_reward/std": 0.13410645723342896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 141.75, + "completions/mean_terminated_length": 141.75, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.1874192953329644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.03424076386727393, + "learning_rate": 1.870967741935484e-05, + "loss": 0.0014, + "num_tokens": 8773268.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 358.5, + "completions/mean_terminated_length": 358.5, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.1876037631433315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.02082297159358859, + "learning_rate": 1.872811059907834e-05, + "loss": 0.0008, + "num_tokens": 8778944.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 382.625, + "completions/mean_terminated_length": 382.625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.1877882309536986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.026023157639428973, + "learning_rate": 1.8746543778801843e-05, + "loss": 0.001, + "num_tokens": 8789269.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.18797269876406567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.034676977433264256, + "learning_rate": 1.8764976958525348e-05, + "loss": 0.0014, + "num_tokens": 8795116.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 434.75, + "completions/mean_terminated_length": 434.75, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.18815716657443277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.02111108449753374, + "learning_rate": 1.878341013824885e-05, + "loss": 0.0008, + "num_tokens": 8801842.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.18834163438479984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.023721578298136592, + "learning_rate": 1.880184331797235e-05, + "loss": 0.0009, + "num_tokens": 8808060.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 396.875, + "completions/mean_terminated_length": 396.875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.18852610219516694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.04490457125939429, + "learning_rate": 1.8820276497695853e-05, + "loss": 0.0018, + "num_tokens": 8820187.0, + "reward": 1.6363636255264282, + "reward_std": 0.2571297287940979, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 606.75, + "completions/mean_terminated_length": 606.75, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.18871057000553404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.01960376021452248, + "learning_rate": 1.8838709677419354e-05, + "loss": 0.0008, + "num_tokens": 8832201.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 229.875, + "completions/mean_terminated_length": 229.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.1888950378159011, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.038473634980618954, + "learning_rate": 1.885714285714286e-05, + "loss": 0.0015, + "num_tokens": 8837048.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 885.625, + "completions/mean_terminated_length": 885.625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.1890795056262682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.435546875, + "kl": 0.014221677207387984, + "learning_rate": 1.887557603686636e-05, + "loss": 0.0006, + "num_tokens": 8852869.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 367.625, + "completions/mean_terminated_length": 367.625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.1892639734366353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.03798526444006711, + "learning_rate": 1.8894009216589862e-05, + "loss": 0.0015, + "num_tokens": 8861650.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 466.75, + "completions/mean_terminated_length": 466.75, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.18944844124700239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.024337452370673418, + "learning_rate": 1.8912442396313367e-05, + "loss": 0.001, + "num_tokens": 8870176.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 410.875, + "completions/mean_terminated_length": 410.875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.1896329090573695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.051637128461152315, + "learning_rate": 1.893087557603687e-05, + "loss": 0.0021, + "num_tokens": 8877895.0, + "reward": 1.3106060028076172, + "reward_std": 0.44499343633651733, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43560606241226196, + "rewards/fixed_code_pass_all_test_reward/std": 0.22903358936309814, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 221.75, + "completions/mean_terminated_length": 221.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.1898173768677366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.06695012468844652, + "learning_rate": 1.894930875576037e-05, + "loss": 0.0027, + "num_tokens": 8886357.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 436.625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.19000184467810366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.024263644823804498, + "learning_rate": 1.896774193548387e-05, + "loss": 0.001, + "num_tokens": 8892802.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 438.0, + "completions/mean_terminated_length": 438.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.19018631248847076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.02297888114117086, + "learning_rate": 1.8986175115207373e-05, + "loss": 0.0009, + "num_tokens": 8902466.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 260.5, + "completions/mean_terminated_length": 260.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.19037078029883786, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.04576683440245688, + "learning_rate": 1.9004608294930878e-05, + "loss": 0.0018, + "num_tokens": 8907334.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.19055524810920493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.028550986666232347, + "learning_rate": 1.902304147465438e-05, + "loss": 0.0011, + "num_tokens": 8911778.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.19073971591957203, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.048959774896502495, + "learning_rate": 1.904147465437788e-05, + "loss": 0.002, + "num_tokens": 8918270.0, + "reward": 1.5150001049041748, + "reward_std": 0.3342795968055725, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5149999856948853, + "rewards/fixed_code_pass_all_test_reward/std": 0.3342796266078949, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 337.875, + "completions/mean_terminated_length": 337.875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.19092418372993913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.08370061742607504, + "learning_rate": 1.9059907834101383e-05, + "loss": 0.0033, + "num_tokens": 8928165.0, + "reward": 1.7884615659713745, + "reward_std": 0.3916930854320526, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.391693115234375, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 159.375, + "completions/mean_terminated_length": 159.375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.1911086515403062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.306640625, + "kl": 0.04091053269803524, + "learning_rate": 1.9078341013824884e-05, + "loss": 0.0016, + "num_tokens": 8932136.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 427.125, + "completions/mean_terminated_length": 427.125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.1912931193506733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.04178572306409478, + "learning_rate": 1.909677419354839e-05, + "loss": 0.0017, + "num_tokens": 8942313.0, + "reward": 1.5625, + "reward_std": 0.4172614812850952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 413.25, + "completions/mean_terminated_length": 413.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.1914775871610404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.016138374689035118, + "learning_rate": 1.911520737327189e-05, + "loss": 0.0006, + "num_tokens": 8952707.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 390.375, + "completions/mean_terminated_length": 390.375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.19166205497140748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.037209371803328395, + "learning_rate": 1.9133640552995392e-05, + "loss": 0.0015, + "num_tokens": 8960054.0, + "reward": 1.2805850505828857, + "reward_std": 0.01880606822669506, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2805851101875305, + "rewards/fixed_code_pass_all_test_reward/std": 0.018806030973792076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 478.375, + "completions/mean_terminated_length": 478.375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.19184652278177458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.041813582414761186, + "learning_rate": 1.9152073732718897e-05, + "loss": 0.0017, + "num_tokens": 8971665.0, + "reward": 1.7083332538604736, + "reward_std": 0.412545770406723, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.41254574060440063, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 468.375, + "completions/mean_terminated_length": 468.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.19203099059214168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.026070336112752557, + "learning_rate": 1.91705069124424e-05, + "loss": 0.001, + "num_tokens": 8983484.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.19221545840250875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.016754469892475754, + "learning_rate": 1.91889400921659e-05, + "loss": 0.0007, + "num_tokens": 8987806.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.19239992621287585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.042511930922046304, + "learning_rate": 1.9207373271889402e-05, + "loss": 0.0017, + "num_tokens": 8995737.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 328.0, + "completions/mean_terminated_length": 328.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.19258439402324296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.06580156041309237, + "learning_rate": 1.9225806451612907e-05, + "loss": 0.0026, + "num_tokens": 9004881.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 419.125, + "completions/mean_terminated_length": 419.125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.19276886183361003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.03426528465934098, + "learning_rate": 1.9244239631336408e-05, + "loss": 0.0014, + "num_tokens": 9012754.0, + "reward": 1.6304347515106201, + "reward_std": 0.27002573013305664, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6304347515106201, + "rewards/fixed_code_pass_all_test_reward/std": 0.27002567052841187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.19295332964397713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.031216715462505817, + "learning_rate": 1.926267281105991e-05, + "loss": 0.0012, + "num_tokens": 9019601.0, + "reward": 1.5, + "reward_std": 0.360259473323822, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.19093996286392212, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 644.5, + "completions/mean_terminated_length": 644.5, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.19313779745434423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.02768910489976406, + "learning_rate": 1.928110599078341e-05, + "loss": 0.0011, + "num_tokens": 9031093.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1732.0, + "completions/max_terminated_length": 1732.0, + "completions/mean_length": 650.125, + "completions/mean_terminated_length": 650.125, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.1933222652647113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.74609375, + "kl": 0.027823954238556325, + "learning_rate": 1.9299539170506913e-05, + "loss": 0.0011, + "num_tokens": 9044534.0, + "reward": 1.6875, + "reward_std": 0.42889204621315, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.14847105741500854, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 410.25, + "completions/mean_terminated_length": 410.25, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.1935067330750784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.026442745584063232, + "learning_rate": 1.9317972350230418e-05, + "loss": 0.0011, + "num_tokens": 9051968.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 566.25, + "completions/mean_terminated_length": 566.25, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.1936912008854455, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.035375138744711876, + "learning_rate": 1.933640552995392e-05, + "loss": 0.0014, + "num_tokens": 9062242.0, + "reward": 1.1120688915252686, + "reward_std": 0.07543544471263885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11206896603107452, + "rewards/fixed_code_pass_all_test_reward/std": 0.07543543726205826, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 371.125, + "completions/mean_terminated_length": 371.125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.19387566869581258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04167580115608871, + "learning_rate": 1.935483870967742e-05, + "loss": 0.0017, + "num_tokens": 9072155.0, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 214.75, + "completions/mean_terminated_length": 214.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.19406013650617968, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.5, + "kl": 0.17124384664930403, + "learning_rate": 1.9373271889400926e-05, + "loss": 0.0068, + "num_tokens": 9076641.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 360.375, + "completions/mean_terminated_length": 360.375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.19424460431654678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.056981916539371014, + "learning_rate": 1.9391705069124424e-05, + "loss": 0.0023, + "num_tokens": 9087012.0, + "reward": 1.1875, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 188.0, + "completions/mean_terminated_length": 188.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.19442907212691385, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.031192297814413905, + "learning_rate": 1.941013824884793e-05, + "loss": 0.0012, + "num_tokens": 9091548.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 351.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.19461353993728095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.014351689431350678, + "learning_rate": 1.942857142857143e-05, + "loss": 0.0006, + "num_tokens": 9097245.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 131.75, + "completions/mean_terminated_length": 131.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.19479800774764802, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.030655331094749272, + "learning_rate": 1.9447004608294932e-05, + "loss": 0.0012, + "num_tokens": 9100955.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 180.125, + "completions/mean_terminated_length": 180.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.19498247555801512, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.031449867179617286, + "learning_rate": 1.9465437788018437e-05, + "loss": 0.0013, + "num_tokens": 9105444.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 373.375, + "completions/mean_terminated_length": 373.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.19516694336838222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.04106221976689994, + "learning_rate": 1.948387096774194e-05, + "loss": 0.0016, + "num_tokens": 9112991.0, + "reward": 1.6308139562606812, + "reward_std": 0.22077718377113342, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6308139562606812, + "rewards/fixed_code_pass_all_test_reward/std": 0.22077718377113342, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 286.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.1953514111787493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4609375, + "kl": 0.053116932045668364, + "learning_rate": 1.950230414746544e-05, + "loss": 0.0021, + "num_tokens": 9119100.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 151.75, + "completions/mean_terminated_length": 151.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1955358789891164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.036159938434138894, + "learning_rate": 1.952073732718894e-05, + "loss": 0.0014, + "num_tokens": 9122994.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.1957203467994835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.046173508977517486, + "learning_rate": 1.9539170506912443e-05, + "loss": 0.0018, + "num_tokens": 9129401.0, + "reward": 1.2872340679168701, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.28723403811454773, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 357.375, + "completions/mean_terminated_length": 357.375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.19590481460985057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.03306983155198395, + "learning_rate": 1.9557603686635948e-05, + "loss": 0.0013, + "num_tokens": 9137284.0, + "reward": 0.7573529481887817, + "reward_std": 0.46762460470199585, + "rewards/fixed_code_pass_all_test_reward/mean": 0.007352941203862429, + "rewards/fixed_code_pass_all_test_reward/std": 0.013615001924335957, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.19608928242021767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.07243763422593474, + "learning_rate": 1.957603686635945e-05, + "loss": 0.0029, + "num_tokens": 9147275.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 238.0, + "completions/mean_terminated_length": 238.0, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.19627375023058477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.028038288466632366, + "learning_rate": 1.959447004608295e-05, + "loss": 0.0011, + "num_tokens": 9152371.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 253.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.19645821804095184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.05310514033772051, + "learning_rate": 1.9612903225806452e-05, + "loss": 0.0021, + "num_tokens": 9164757.0, + "reward": 1.0258619785308838, + "reward_std": 0.015962397679686546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.025862067937850952, + "rewards/fixed_code_pass_all_test_reward/std": 0.01596241630613804, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 137.75, + "completions/mean_terminated_length": 137.75, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.19664268585131894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.08375988807529211, + "learning_rate": 1.9631336405529954e-05, + "loss": 0.0034, + "num_tokens": 9168731.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.19682715366168604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.044415622018277645, + "learning_rate": 1.964976958525346e-05, + "loss": 0.0018, + "num_tokens": 9179193.0, + "reward": 1.4614661931991577, + "reward_std": 0.5035419464111328, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4614661633968353, + "rewards/fixed_code_pass_all_test_reward/std": 0.5035419464111328, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 178.875, + "completions/mean_terminated_length": 178.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.19701162147205312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.08485871041193604, + "learning_rate": 1.966820276497696e-05, + "loss": 0.0034, + "num_tokens": 9188704.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 221.25, + "completions/mean_terminated_length": 221.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.19719608928242022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.11099720187485218, + "learning_rate": 1.9686635944700462e-05, + "loss": 0.0044, + "num_tokens": 9196498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 295.375, + "completions/mean_terminated_length": 295.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.19738055709278732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.057019088184461, + "learning_rate": 1.9705069124423967e-05, + "loss": 0.0023, + "num_tokens": 9203085.0, + "reward": 1.9057971239089966, + "reward_std": 0.10070707648992538, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9057971239089966, + "rewards/fixed_code_pass_all_test_reward/std": 0.10070714354515076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.1975650249031544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.02561216347385198, + "learning_rate": 1.9723502304147465e-05, + "loss": 0.001, + "num_tokens": 9208636.0, + "reward": 1.9375, + "reward_std": 0.09449111670255661, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.09449111670255661, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.1977494927135215, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.03833519690670073, + "learning_rate": 1.974193548387097e-05, + "loss": 0.0015, + "num_tokens": 9214763.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 286.375, + "completions/mean_terminated_length": 286.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.1979339605238886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.08201934862881899, + "learning_rate": 1.976036866359447e-05, + "loss": 0.0033, + "num_tokens": 9224806.0, + "reward": 1.3684210777282715, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3684210479259491, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 457.25, + "completions/mean_terminated_length": 457.25, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.19811842833425566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.04412001185119152, + "learning_rate": 1.9778801843317973e-05, + "loss": 0.0018, + "num_tokens": 9236048.0, + "reward": 1.9008619785308838, + "reward_std": 0.036574505269527435, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9008620381355286, + "rewards/fixed_code_pass_all_test_reward/std": 0.03657449036836624, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.19830289614462276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.07448474573902786, + "learning_rate": 1.9797235023041478e-05, + "loss": 0.003, + "num_tokens": 9247186.0, + "reward": 1.4969512224197388, + "reward_std": 0.09395557641983032, + "rewards/fixed_code_pass_all_test_reward/mean": 0.49695122241973877, + "rewards/fixed_code_pass_all_test_reward/std": 0.09395559877157211, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 613.5, + "completions/mean_terminated_length": 613.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.19848736395498986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.03945091995410621, + "learning_rate": 1.981566820276498e-05, + "loss": 0.0016, + "num_tokens": 9260358.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.19867183176535694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.04427030589431524, + "learning_rate": 1.983410138248848e-05, + "loss": 0.0018, + "num_tokens": 9264704.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.19885629957572404, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.1550848176702857, + "learning_rate": 1.9852534562211983e-05, + "loss": 0.0062, + "num_tokens": 9270305.0, + "reward": 1.1607142686843872, + "reward_std": 0.5634002089500427, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.33284705877304077, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 138.25, + "completions/mean_terminated_length": 138.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.19904076738609114, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.055282536428421736, + "learning_rate": 1.9870967741935484e-05, + "loss": 0.0022, + "num_tokens": 9274187.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 322.875, + "completions/mean_terminated_length": 322.875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1992252351964582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.248046875, + "kl": 0.08959878934547305, + "learning_rate": 1.988940092165899e-05, + "loss": 0.0036, + "num_tokens": 9283986.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 155.0, + "completions/mean_terminated_length": 155.0, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.1994097030068253, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.03125, + "kl": 0.1725041288882494, + "learning_rate": 1.990783410138249e-05, + "loss": 0.0069, + "num_tokens": 9292738.0, + "reward": 1.7817796468734741, + "reward_std": 0.4042939841747284, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7817796468734741, + "rewards/fixed_code_pass_all_test_reward/std": 0.4042940139770508, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 281.125, + "completions/mean_terminated_length": 281.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.1995941708171924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.08496152609586716, + "learning_rate": 1.9926267281105992e-05, + "loss": 0.0034, + "num_tokens": 9299315.0, + "reward": 1.6956522464752197, + "reward_std": 0.12297506630420685, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6956522464752197, + "rewards/fixed_code_pass_all_test_reward/std": 0.12297508865594864, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.19977863862755948, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.10247335932217538, + "learning_rate": 1.9944700460829494e-05, + "loss": 0.0041, + "num_tokens": 9307925.0, + "reward": 1.7529070377349854, + "reward_std": 0.45756959915161133, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7529069781303406, + "rewards/fixed_code_pass_all_test_reward/std": 0.45756959915161133, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 586.25, + "completions/mean_terminated_length": 586.25, + "completions/min_length": 552.0, + "completions/min_terminated_length": 552.0, + "epoch": 0.19996310643792659, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.05561680067330599, + "learning_rate": 1.9963133640552995e-05, + "loss": 0.0022, + "num_tokens": 9319391.0, + "reward": 1.7999999523162842, + "reward_std": 0.33166247606277466, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.33166250586509705, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 349.125, + "completions/mean_terminated_length": 349.125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.20014757424829369, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.1019435403868556, + "learning_rate": 1.99815668202765e-05, + "loss": 0.0041, + "num_tokens": 9328320.0, + "reward": 1.5433673858642578, + "reward_std": 0.15308551490306854, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5433673858642578, + "rewards/fixed_code_pass_all_test_reward/std": 0.15308551490306854, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 396.625, + "completions/mean_terminated_length": 396.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.20033204205866076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.09953207708895206, + "learning_rate": 2e-05, + "loss": 0.004, + "num_tokens": 9337957.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 792.625, + "completions/mean_terminated_length": 792.625, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.20051650986902786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.04149740864522755, + "learning_rate": 1.9999999481633253e-05, + "loss": 0.0017, + "num_tokens": 9354746.0, + "reward": 1.53125, + "reward_std": 0.5077524185180664, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.20070097767939493, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.11919962940737605, + "learning_rate": 1.9999997926533058e-05, + "loss": 0.0048, + "num_tokens": 9363459.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 277.375, + "completions/mean_terminated_length": 277.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.20088544548976203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.05876771756447852, + "learning_rate": 1.999999533469958e-05, + "loss": 0.0024, + "num_tokens": 9371918.0, + "reward": 1.3250000476837158, + "reward_std": 0.5625198483467102, + "rewards/fixed_code_pass_all_test_reward/mean": 0.45000001788139343, + "rewards/fixed_code_pass_all_test_reward/std": 0.2507132887840271, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 385.0, + "completions/mean_terminated_length": 385.0, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.20106991330012913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.06345975468866527, + "learning_rate": 1.9999991706133083e-05, + "loss": 0.0025, + "num_tokens": 9385942.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 266.125, + "completions/mean_terminated_length": 266.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2012543811104962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.09511996665969491, + "learning_rate": 1.9999987040833953e-05, + "loss": 0.0038, + "num_tokens": 9393279.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 148.5, + "completions/mean_terminated_length": 148.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.2014388489208633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.06794558092951775, + "learning_rate": 1.9999981338802665e-05, + "loss": 0.0027, + "num_tokens": 9397347.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 123.875, + "completions/mean_terminated_length": 123.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.2016233167312304, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.05465734051540494, + "learning_rate": 1.9999974600039814e-05, + "loss": 0.0022, + "num_tokens": 9401266.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 232.625, + "completions/mean_terminated_length": 232.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.20180778454159748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05780915659852326, + "learning_rate": 1.9999966824546095e-05, + "loss": 0.0023, + "num_tokens": 9409711.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 177.375, + "completions/mean_terminated_length": 177.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.20199225235196458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.03219430800527334, + "learning_rate": 1.9999958012322317e-05, + "loss": 0.0013, + "num_tokens": 9414074.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 228.125, + "completions/mean_terminated_length": 228.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.20217672016233168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.051416757283732295, + "learning_rate": 1.9999948163369395e-05, + "loss": 0.0021, + "num_tokens": 9422523.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 140.375, + "completions/mean_terminated_length": 140.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.20236118797269875, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.08728176075965166, + "learning_rate": 1.9999937277688347e-05, + "loss": 0.0035, + "num_tokens": 9426614.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 365.0, + "completions/mean_terminated_length": 365.0, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.20254565578306585, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.04686649201903492, + "learning_rate": 1.9999925355280302e-05, + "loss": 0.0019, + "num_tokens": 9437302.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.20273012359343295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.050774041563272476, + "learning_rate": 1.9999912396146495e-05, + "loss": 0.002, + "num_tokens": 9444812.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 107.0, + "completions/mean_terminated_length": 107.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.20291459140380003, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.125, + "kl": 0.13979557622224092, + "learning_rate": 1.9999898400288278e-05, + "loss": 0.0056, + "num_tokens": 9448540.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 235.0, + "completions/mean_terminated_length": 235.0, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.20309905921416713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.060022128745913506, + "learning_rate": 1.999988336770709e-05, + "loss": 0.0024, + "num_tokens": 9457828.0, + "reward": 1.655063271522522, + "reward_std": 0.05273657664656639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.655063271522522, + "rewards/fixed_code_pass_all_test_reward/std": 0.05273657664656639, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 124.25, + "completions/mean_terminated_length": 124.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.20328352702453423, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.05706826550886035, + "learning_rate": 1.9999867298404498e-05, + "loss": 0.0023, + "num_tokens": 9461814.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 140.75, + "completions/mean_terminated_length": 140.75, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.2034679948349013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.06405069655738771, + "learning_rate": 1.9999850192382163e-05, + "loss": 0.0026, + "num_tokens": 9466012.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.2036524626452684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.07642073556780815, + "learning_rate": 1.999983204964186e-05, + "loss": 0.0031, + "num_tokens": 9472015.0, + "reward": 1.8858695030212402, + "reward_std": 0.3228096067905426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, + "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 117.125, + "completions/mean_terminated_length": 117.125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.2038369304556355, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8125, + "kl": 0.04644873715005815, + "learning_rate": 1.999981287018547e-05, + "loss": 0.0019, + "num_tokens": 9476672.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 222.125, + "completions/mean_terminated_length": 222.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.20402139826600257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.05783520103432238, + "learning_rate": 1.999979265401498e-05, + "loss": 0.0023, + "num_tokens": 9486257.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 164.125, + "completions/mean_terminated_length": 164.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.20420586607636967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.049751730635762215, + "learning_rate": 1.999977140113249e-05, + "loss": 0.002, + "num_tokens": 9494186.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 290.875, + "completions/mean_terminated_length": 290.875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.20439033388673677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.04597937222570181, + "learning_rate": 1.9999749111540197e-05, + "loss": 0.0018, + "num_tokens": 9502361.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 292.375, + "completions/mean_terminated_length": 292.375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.20457480169710385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.04796504694968462, + "learning_rate": 1.9999725785240423e-05, + "loss": 0.0019, + "num_tokens": 9509524.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.20475926950747095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.034556994680315256, + "learning_rate": 1.9999701422235574e-05, + "loss": 0.0014, + "num_tokens": 9516303.0, + "reward": 1.220588207244873, + "reward_std": 0.04159453138709068, + "rewards/fixed_code_pass_all_test_reward/mean": 0.22058823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.04159452021121979, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 291.125, + "completions/mean_terminated_length": 291.125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.20494373731783805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.036939756479114294, + "learning_rate": 1.9999676022528178e-05, + "loss": 0.0015, + "num_tokens": 9524512.0, + "reward": 1.0833333730697632, + "reward_std": 0.1543033868074417, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430335700511932, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 392.625, + "completions/mean_terminated_length": 392.625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.20512820512820512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.024581013014540076, + "learning_rate": 1.9999649586120875e-05, + "loss": 0.001, + "num_tokens": 9538757.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 191.125, + "completions/mean_terminated_length": 191.125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.20531267293857222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.06351345451548696, + "learning_rate": 1.9999622113016402e-05, + "loss": 0.0025, + "num_tokens": 9543526.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.20549714074893932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.0589489764533937, + "learning_rate": 1.9999593603217605e-05, + "loss": 0.0024, + "num_tokens": 9555188.0, + "reward": 1.8499999046325684, + "reward_std": 0.29760950803756714, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2976095378398895, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 108.25, + "completions/mean_terminated_length": 108.25, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.2056816085593064, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.0585943921469152, + "learning_rate": 1.9999564056727442e-05, + "loss": 0.0023, + "num_tokens": 9558974.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2058660763696735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04888463672250509, + "learning_rate": 1.9999533473548976e-05, + "loss": 0.002, + "num_tokens": 9566255.0, + "reward": 1.796875, + "reward_std": 0.3761144280433655, + "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3761144280433655, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 303.625, + "completions/mean_terminated_length": 303.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.2060505441800406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.031907302094623446, + "learning_rate": 1.9999501853685378e-05, + "loss": 0.0013, + "num_tokens": 9573884.0, + "reward": 1.2109375, + "reward_std": 0.29112139344215393, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2109375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2911214232444763, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.20623501199040767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.03965413593687117, + "learning_rate": 1.9999469197139928e-05, + "loss": 0.0016, + "num_tokens": 9583100.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 227.625, + "completions/mean_terminated_length": 227.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.20641947980077477, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.11001327354460955, + "learning_rate": 1.9999435503916003e-05, + "loss": 0.0044, + "num_tokens": 9592249.0, + "reward": 0.771634578704834, + "reward_std": 0.7069153189659119, + "rewards/fixed_code_pass_all_test_reward/mean": 0.14663462340831757, + "rewards/fixed_code_pass_all_test_reward/std": 0.3258608281612396, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 150.25, + "completions/mean_terminated_length": 150.25, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.20660394761114187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.05270298011600971, + "learning_rate": 1.9999400774017105e-05, + "loss": 0.0021, + "num_tokens": 9596179.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 351.875, + "completions/mean_terminated_length": 351.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.20678841542150894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.021906271926127374, + "learning_rate": 1.9999365007446837e-05, + "loss": 0.0009, + "num_tokens": 9603410.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 334.625, + "completions/mean_terminated_length": 334.625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.20697288323187604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369140625, + "kl": 0.021144459256902337, + "learning_rate": 1.9999328204208893e-05, + "loss": 0.0008, + "num_tokens": 9611079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 251.0, + "completions/mean_terminated_length": 251.0, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.20715735104224312, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.030967160826548934, + "learning_rate": 1.9999290364307105e-05, + "loss": 0.0012, + "num_tokens": 9617599.0, + "reward": 1.2010868787765503, + "reward_std": 0.046115659177303314, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20108693838119507, + "rewards/fixed_code_pass_all_test_reward/std": 0.046115655452013016, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 238.375, + "completions/mean_terminated_length": 238.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.20734181885261022, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.059347836300730705, + "learning_rate": 1.9999251487745386e-05, + "loss": 0.0024, + "num_tokens": 9626442.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 157.5, + "completions/mean_terminated_length": 157.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.20752628666297732, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "kl": 0.06781802047044039, + "learning_rate": 1.9999211574527767e-05, + "loss": 0.0027, + "num_tokens": 9631062.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.2077107544733444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2451171875, + "kl": 0.05504463938996196, + "learning_rate": 1.999917062465839e-05, + "loss": 0.0022, + "num_tokens": 9639241.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2078952222837115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.029047264833934605, + "learning_rate": 1.99991286381415e-05, + "loss": 0.0012, + "num_tokens": 9644307.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 170.25, + "completions/mean_terminated_length": 170.25, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2080796900940786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.216796875, + "kl": 0.07558548729866743, + "learning_rate": 1.9999085614981443e-05, + "loss": 0.003, + "num_tokens": 9653181.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 95.375, + "completions/mean_terminated_length": 95.375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.20826415790444566, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.796875, + "kl": 0.06220046244561672, + "learning_rate": 1.999904155518269e-05, + "loss": 0.0025, + "num_tokens": 9657432.0, + "reward": 0.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 110.75, + "completions/mean_terminated_length": 110.75, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.20844862571481276, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.0670712236315012, + "learning_rate": 1.99989964587498e-05, + "loss": 0.0027, + "num_tokens": 9661014.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 220.125, + "completions/mean_terminated_length": 220.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.20863309352517986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.03478195331990719, + "learning_rate": 1.999895032568745e-05, + "loss": 0.0014, + "num_tokens": 9669807.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 543.125, + "completions/mean_terminated_length": 543.125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.20881756133554694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.028197588166221976, + "learning_rate": 1.999890315600043e-05, + "loss": 0.0011, + "num_tokens": 9685200.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 124.75, + "completions/mean_terminated_length": 124.75, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.20900202914591404, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "kl": 0.10967082437127829, + "learning_rate": 1.999885494969362e-05, + "loss": 0.0044, + "num_tokens": 9688918.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 261.125, + "completions/mean_terminated_length": 261.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.20918649695628114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.0661507723852992, + "learning_rate": 1.9998805706772022e-05, + "loss": 0.0026, + "num_tokens": 9697807.0, + "reward": 1.7840908765792847, + "reward_std": 0.10300161689519882, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7840909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.10300164669752121, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 308.0, + "completions/mean_terminated_length": 308.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.2093709647666482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.025793679291382432, + "learning_rate": 1.9998755427240745e-05, + "loss": 0.001, + "num_tokens": 9707799.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 238.5, + "completions/mean_terminated_length": 238.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2095554325770153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.04265776718966663, + "learning_rate": 1.9998704111104993e-05, + "loss": 0.0017, + "num_tokens": 9715835.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2097399003873824, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03189303330145776, + "learning_rate": 1.9998651758370097e-05, + "loss": 0.0013, + "num_tokens": 9724722.0, + "reward": 1.5798611640930176, + "reward_std": 0.46420300006866455, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7048611044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.4181027114391327, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 280.0, + "completions/mean_terminated_length": 280.0, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.20992436819774948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.031910013407468796, + "learning_rate": 1.9998598369041474e-05, + "loss": 0.0013, + "num_tokens": 9735034.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 148.125, + "completions/mean_terminated_length": 148.125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.21010883600811658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.05286032170988619, + "learning_rate": 1.9998543943124667e-05, + "loss": 0.0021, + "num_tokens": 9739211.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.21029330381848368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.03506349853705615, + "learning_rate": 1.9998488480625314e-05, + "loss": 0.0014, + "num_tokens": 9746367.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.21047777162885076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.06959897326305509, + "learning_rate": 1.999843198154917e-05, + "loss": 0.0028, + "num_tokens": 9753772.0, + "reward": 1.4666666984558105, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 255.375, + "completions/mean_terminated_length": 255.375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.21066223943921786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.0557419213000685, + "learning_rate": 1.9998374445902084e-05, + "loss": 0.0022, + "num_tokens": 9761207.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 353.625, + "completions/mean_terminated_length": 353.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.21084670724958496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.02704835485201329, + "learning_rate": 1.999831587369003e-05, + "loss": 0.0011, + "num_tokens": 9770620.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.21103117505995203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.038854264887049794, + "learning_rate": 1.9998256264919072e-05, + "loss": 0.0016, + "num_tokens": 9777655.0, + "reward": 1.3177082538604736, + "reward_std": 0.16793787479400635, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3177083134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.16793787479400635, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.21121564287031913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.030863576103001833, + "learning_rate": 1.9998195619595396e-05, + "loss": 0.0012, + "num_tokens": 9783967.0, + "reward": 1.545454502105713, + "reward_std": 0.500295102596283, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5454545617103577, + "rewards/fixed_code_pass_all_test_reward/std": 0.500295102596283, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 182.75, + "completions/mean_terminated_length": 182.75, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.21140011068068623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.04182061390019953, + "learning_rate": 1.9998133937725284e-05, + "loss": 0.0017, + "num_tokens": 9790077.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 299.0, + "completions/mean_terminated_length": 299.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2115845784910533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.02552569843828678, + "learning_rate": 1.999807121931514e-05, + "loss": 0.001, + "num_tokens": 9796813.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 394.75, + "completions/mean_terminated_length": 394.75, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.2117690463014204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.05180608527734876, + "learning_rate": 1.9998007464371458e-05, + "loss": 0.0021, + "num_tokens": 9808379.0, + "reward": 1.5208333730697632, + "reward_std": 0.058925606310367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2119535141117875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.07691930164583027, + "learning_rate": 1.999794267290085e-05, + "loss": 0.0031, + "num_tokens": 9817545.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 299.125, + "completions/mean_terminated_length": 299.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.21213798192215458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23828125, + "kl": 0.0753004492726177, + "learning_rate": 1.999787684491003e-05, + "loss": 0.003, + "num_tokens": 9827178.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 855.625, + "completions/mean_terminated_length": 685.2857666015625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.21232244973252168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.015278811217285693, + "learning_rate": 1.999780998040583e-05, + "loss": 0.0006, + "num_tokens": 9841103.0, + "reward": 0.689453125, + "reward_std": 0.5716906189918518, + "rewards/fixed_code_pass_all_test_reward/mean": 0.064453125, + "rewards/fixed_code_pass_all_test_reward/std": 0.06105329096317291, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 601.5, + "completions/mean_terminated_length": 601.5, + "completions/min_length": 543.0, + "completions/min_terminated_length": 543.0, + "epoch": 0.21250691754288878, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.024926221230998635, + "learning_rate": 1.9997742079395178e-05, + "loss": 0.001, + "num_tokens": 9852283.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.21269138535325585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03114215424284339, + "learning_rate": 1.9997673141885113e-05, + "loss": 0.0012, + "num_tokens": 9860758.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 295.5, + "completions/mean_terminated_length": 295.5, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.21287585316362295, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.13573351502418518, + "learning_rate": 1.999760316788278e-05, + "loss": 0.0054, + "num_tokens": 9867538.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 167.125, + "completions/mean_terminated_length": 167.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.21306032097399003, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.07483479520305991, + "learning_rate": 1.9997532157395432e-05, + "loss": 0.003, + "num_tokens": 9871699.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 446.0, + "completions/mean_terminated_length": 217.1428680419922, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.21324478878435713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.04914769879542291, + "learning_rate": 1.9997460110430443e-05, + "loss": 0.002, + "num_tokens": 9880851.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 348.5, + "completions/mean_terminated_length": 348.5, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.21342925659472423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.04019129183143377, + "learning_rate": 1.999738702699527e-05, + "loss": 0.0016, + "num_tokens": 9887631.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 373.125, + "completions/mean_terminated_length": 373.125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.2136137244050913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.02923118695616722, + "learning_rate": 1.9997312907097495e-05, + "loss": 0.0012, + "num_tokens": 9911400.0, + "reward": 1.5873494148254395, + "reward_std": 0.20349884033203125, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5873494148254395, + "rewards/fixed_code_pass_all_test_reward/std": 0.20349884033203125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.2137981922154584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.0431978995911777, + "learning_rate": 1.9997237750744797e-05, + "loss": 0.0017, + "num_tokens": 9921763.0, + "reward": 1.9629629850387573, + "reward_std": 0.10475657135248184, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9629629850387573, + "rewards/fixed_code_pass_all_test_reward/std": 0.10475657135248184, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2139826600258255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.043124674586579204, + "learning_rate": 1.999716155794498e-05, + "loss": 0.0017, + "num_tokens": 9925956.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 282.375, + "completions/mean_terminated_length": 282.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.21416712783619257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.06822628027293831, + "learning_rate": 1.9997084328705926e-05, + "loss": 0.0027, + "num_tokens": 9934799.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 620.125, + "completions/mean_terminated_length": 620.125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.21435159564655967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.03318669833242893, + "learning_rate": 1.9997006063035655e-05, + "loss": 0.0013, + "num_tokens": 9950768.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.21453606345692677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.0617794890422374, + "learning_rate": 1.9996926760942276e-05, + "loss": 0.0025, + "num_tokens": 9955060.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 358.375, + "completions/mean_terminated_length": 358.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.21472053126729385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.05164128262549639, + "learning_rate": 1.999684642243401e-05, + "loss": 0.0021, + "num_tokens": 9965975.0, + "reward": 1.3295453786849976, + "reward_std": 0.42898935079574585, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32954543828964233, + "rewards/fixed_code_pass_all_test_reward/std": 0.42898938059806824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.21490499907766095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.052842136239632964, + "learning_rate": 1.999676504751919e-05, + "loss": 0.0021, + "num_tokens": 9970750.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1473.0, + "completions/max_terminated_length": 1473.0, + "completions/mean_length": 447.25, + "completions/mean_terminated_length": 447.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.21508946688802805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.03744844766333699, + "learning_rate": 1.999668263620625e-05, + "loss": 0.0015, + "num_tokens": 9981176.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.21527393469839512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.04440966295078397, + "learning_rate": 1.9996599188503728e-05, + "loss": 0.0018, + "num_tokens": 9986293.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 263.875, + "completions/mean_terminated_length": 263.875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.21545840250876222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.036768265534192324, + "learning_rate": 1.9996514704420286e-05, + "loss": 0.0015, + "num_tokens": 9995380.0, + "reward": 1.4945652484893799, + "reward_std": 0.2436751127243042, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4945652186870575, + "rewards/fixed_code_pass_all_test_reward/std": 0.24367506802082062, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 232.625, + "completions/mean_terminated_length": 232.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.21564287031912932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.032264088455121964, + "learning_rate": 1.9996429183964673e-05, + "loss": 0.0013, + "num_tokens": 10004265.0, + "reward": 1.3355263471603394, + "reward_std": 0.539634108543396, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46052634716033936, + "rewards/fixed_code_pass_all_test_reward/std": 0.18608075380325317, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 131.375, + "completions/mean_terminated_length": 131.375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2158273381294964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.05429630680009723, + "learning_rate": 1.999634262714576e-05, + "loss": 0.0022, + "num_tokens": 10007988.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.2160118059398635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.023800981929525733, + "learning_rate": 1.9996255033972524e-05, + "loss": 0.001, + "num_tokens": 10017148.0, + "reward": 1.854567289352417, + "reward_std": 0.27464327216148376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.854567289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.27464327216148376, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 99.75, + "completions/mean_terminated_length": 99.75, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.2161962737502306, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.515625, + "kl": 0.31232333090156317, + "learning_rate": 1.999616640445404e-05, + "loss": 0.0125, + "num_tokens": 10020602.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 307.875, + "completions/mean_terminated_length": 307.875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.21638074156059767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.04309866961557418, + "learning_rate": 1.99960767385995e-05, + "loss": 0.0017, + "num_tokens": 10031625.0, + "reward": 1.5671296119689941, + "reward_std": 0.3813115954399109, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5671296119689941, + "rewards/fixed_code_pass_all_test_reward/std": 0.3813115954399109, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 346.125, + "completions/mean_terminated_length": 346.125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.21656520937096477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9296875, + "kl": 0.0351749011897482, + "learning_rate": 1.9995986036418196e-05, + "loss": 0.0014, + "num_tokens": 10040522.0, + "reward": 1.8775510787963867, + "reward_std": 0.026720546185970306, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8775510191917419, + "rewards/fixed_code_pass_all_test_reward/std": 0.026720546185970306, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.21674967718133187, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.05946945585310459, + "learning_rate": 1.9995894297919536e-05, + "loss": 0.0024, + "num_tokens": 10044862.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 376.25, + "completions/mean_terminated_length": 376.25, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.21693414499169894, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.03526127338409424, + "learning_rate": 1.999580152311303e-05, + "loss": 0.0014, + "num_tokens": 10052264.0, + "reward": 1.1333333253860474, + "reward_std": 0.350962370634079, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, + "rewards/fixed_code_pass_all_test_reward/std": 0.35096240043640137, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 193.75, + "completions/mean_terminated_length": 193.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.21711861280206604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.05083850212395191, + "learning_rate": 1.9995707712008294e-05, + "loss": 0.002, + "num_tokens": 10056902.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 396.125, + "completions/mean_terminated_length": 396.125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.21730308061243314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.03851512516848743, + "learning_rate": 1.9995612864615056e-05, + "loss": 0.0015, + "num_tokens": 10064431.0, + "reward": 1.045454502105713, + "reward_std": 0.7606000900268555, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.44203564524650574, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 426.375, + "completions/mean_terminated_length": 426.375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.21748754842280021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.06132229929789901, + "learning_rate": 1.999551698094315e-05, + "loss": 0.0025, + "num_tokens": 10072346.0, + "reward": 1.9186046123504639, + "reward_std": 0.10765351355075836, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9186046123504639, + "rewards/fixed_code_pass_all_test_reward/std": 0.10765349864959717, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 396.625, + "completions/mean_terminated_length": 396.625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.21767201623316731, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.06797252083197236, + "learning_rate": 1.999542006100251e-05, + "loss": 0.0027, + "num_tokens": 10082103.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 637.0, + "completions/mean_terminated_length": 637.0, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.21785648404353442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.027162770507857203, + "learning_rate": 1.999532210480319e-05, + "loss": 0.0011, + "num_tokens": 10092703.0, + "reward": 1.6875, + "reward_std": 0.4381372928619385, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.4381372928619385, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2180409518539015, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.03437066404148936, + "learning_rate": 1.9995223112355347e-05, + "loss": 0.0014, + "num_tokens": 10098650.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 127.625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.2182254196642686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.10229458566755056, + "learning_rate": 1.9995123083669238e-05, + "loss": 0.0041, + "num_tokens": 10102487.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 557.625, + "completions/mean_terminated_length": 557.625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.2184098874746357, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734375, + "kl": 0.03221483645029366, + "learning_rate": 1.999502201875524e-05, + "loss": 0.0013, + "num_tokens": 10112308.0, + "reward": 1.074519157409668, + "reward_std": 0.21077220141887665, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07451923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.21077223122119904, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 221.375, + "completions/mean_terminated_length": 221.375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.21859435528500276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.04646780528128147, + "learning_rate": 1.9994919917623822e-05, + "loss": 0.0019, + "num_tokens": 10117519.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 402.625, + "completions/mean_terminated_length": 402.625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.21877882309536986, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.96875, + "kl": 0.43370580673217773, + "learning_rate": 1.9994816780285576e-05, + "loss": 0.0173, + "num_tokens": 10127476.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 332.125, + "completions/mean_terminated_length": 332.125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.21896329090573696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.0636683851480484, + "learning_rate": 1.9994712606751197e-05, + "loss": 0.0025, + "num_tokens": 10134413.0, + "reward": 1.375, + "reward_std": 0.39247557520866394, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.39247557520866394, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 249.125, + "completions/mean_terminated_length": 249.125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.21914775871610404, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.08024193253368139, + "learning_rate": 1.9994607397031477e-05, + "loss": 0.0032, + "num_tokens": 10140774.0, + "reward": 1.8571429252624512, + "reward_std": 0.15742090344429016, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.15742090344429016, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 169.75, + "completions/mean_terminated_length": 169.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.21933222652647114, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.05488387937657535, + "learning_rate": 1.9994501151137328e-05, + "loss": 0.0022, + "num_tokens": 10144972.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 140.25, + "completions/mean_terminated_length": 140.25, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.2195166943368382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.10198327852413058, + "learning_rate": 1.9994393869079765e-05, + "loss": 0.0041, + "num_tokens": 10148878.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 278.875, + "completions/mean_terminated_length": 278.875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.2197011621472053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.054983296198770404, + "learning_rate": 1.999428555086991e-05, + "loss": 0.0022, + "num_tokens": 10154141.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 530.875, + "completions/mean_terminated_length": 530.875, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.2198856299575724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76171875, + "kl": 0.03594265994615853, + "learning_rate": 1.999417619651899e-05, + "loss": 0.0014, + "num_tokens": 10164020.0, + "reward": 1.0750000476837158, + "reward_std": 0.10350986570119858, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07500000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.10350984334945679, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 551.5, + "completions/mean_terminated_length": 551.5, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.22007009776793948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.03255474101752043, + "learning_rate": 1.9994065806038345e-05, + "loss": 0.0013, + "num_tokens": 10176896.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 295.125, + "completions/mean_terminated_length": 295.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.22025456557830658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.1157928598113358, + "learning_rate": 1.999395437943942e-05, + "loss": 0.0046, + "num_tokens": 10183313.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 368.25, + "completions/mean_terminated_length": 368.25, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.22043903338867368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.0562732694670558, + "learning_rate": 1.9993841916733764e-05, + "loss": 0.0023, + "num_tokens": 10194379.0, + "reward": 1.375, + "reward_std": 0.2883436381816864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.288343608379364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.22062350119904076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.04111891775391996, + "learning_rate": 1.9993728417933044e-05, + "loss": 0.0016, + "num_tokens": 10199104.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 179.625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.22080796900940786, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.08965247729793191, + "learning_rate": 1.9993613883049015e-05, + "loss": 0.0036, + "num_tokens": 10203285.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 545.875, + "completions/mean_terminated_length": 545.875, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.22099243681977496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.03836157638579607, + "learning_rate": 1.9993498312093557e-05, + "loss": 0.0015, + "num_tokens": 10213372.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 197.125, + "completions/mean_terminated_length": 197.125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.22117690463014203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.0786507367156446, + "learning_rate": 1.9993381705078657e-05, + "loss": 0.0031, + "num_tokens": 10217733.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 276.0, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.22136137244050913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.09480518661439419, + "learning_rate": 1.9993264062016397e-05, + "loss": 0.0038, + "num_tokens": 10227477.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 512.875, + "completions/mean_terminated_length": 293.5714416503906, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.22154584025087623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.64453125, + "kl": 0.0554881856078282, + "learning_rate": 1.999314538291897e-05, + "loss": 0.0022, + "num_tokens": 10238108.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 373.375, + "completions/mean_terminated_length": 373.375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2217303080612433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.04839163552969694, + "learning_rate": 1.999302566779869e-05, + "loss": 0.0019, + "num_tokens": 10245287.0, + "reward": 1.4345238208770752, + "reward_std": 0.4710178077220917, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4345238208770752, + "rewards/fixed_code_pass_all_test_reward/std": 0.47101783752441406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 338.875, + "completions/mean_terminated_length": 338.875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.2219147758716104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.0687980311922729, + "learning_rate": 1.9992904916667963e-05, + "loss": 0.0028, + "num_tokens": 10252230.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 229.5, + "completions/mean_terminated_length": 229.5, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2220992436819775, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.053124125581234694, + "learning_rate": 1.999278312953931e-05, + "loss": 0.0021, + "num_tokens": 10257602.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 607.375, + "completions/mean_terminated_length": 607.375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.22228371149234458, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.04890149366110563, + "learning_rate": 1.9992660306425353e-05, + "loss": 0.002, + "num_tokens": 10268693.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.22246817930271168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.04487239685840905, + "learning_rate": 1.999253644733883e-05, + "loss": 0.0018, + "num_tokens": 10273185.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 374.0, + "completions/mean_terminated_length": 374.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.22265264711307878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.044575080974027514, + "learning_rate": 1.9992411552292575e-05, + "loss": 0.0018, + "num_tokens": 10283377.0, + "reward": 1.46875, + "reward_std": 0.3830161690711975, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3830161690711975, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.22283711492344585, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.08674538531340659, + "learning_rate": 1.999228562129954e-05, + "loss": 0.0035, + "num_tokens": 10288386.0, + "reward": 1.7321428060531616, + "reward_std": 0.36967799067497253, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428060531616, + "rewards/fixed_code_pass_all_test_reward/std": 0.36967799067497253, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 521.0, + "completions/mean_terminated_length": 521.0, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.22302158273381295, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.05816185474395752, + "learning_rate": 1.999215865437279e-05, + "loss": 0.0023, + "num_tokens": 10300866.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 257.0, + "completions/mean_terminated_length": 257.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.22320605054418005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.06780396401882172, + "learning_rate": 1.9992030651525474e-05, + "loss": 0.0027, + "num_tokens": 10305818.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 329.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.22339051835454712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.03649728512391448, + "learning_rate": 1.999190161277087e-05, + "loss": 0.0015, + "num_tokens": 10312879.0, + "reward": 1.85326087474823, + "reward_std": 0.13931958377361298, + "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, + "rewards/fixed_code_pass_all_test_reward/std": 0.1393195539712906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 344.0, + "completions/mean_terminated_length": 344.0, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.22357498616491422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.06630427250638604, + "learning_rate": 1.9991771538122354e-05, + "loss": 0.0027, + "num_tokens": 10319743.0, + "reward": 1.735795497894287, + "reward_std": 0.09326291084289551, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.09326295554637909, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 266.125, + "completions/mean_terminated_length": 266.125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.22375945397528132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.05690090078860521, + "learning_rate": 1.9991640427593412e-05, + "loss": 0.0023, + "num_tokens": 10330176.0, + "reward": 1.0535714626312256, + "reward_std": 0.10287558287382126, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714253783226, + "rewards/fixed_code_pass_all_test_reward/std": 0.10287559032440186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 246.875, + "completions/mean_terminated_length": 246.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2239439217856484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0801225071772933, + "learning_rate": 1.9991508281197634e-05, + "loss": 0.0032, + "num_tokens": 10339495.0, + "reward": 1.3046875, + "reward_std": 0.2828575074672699, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3046875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828575074672699, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 131.5, + "completions/mean_terminated_length": 131.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2241283895960155, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.388671875, + "kl": 0.12518868362531066, + "learning_rate": 1.999137509894872e-05, + "loss": 0.005, + "num_tokens": 10343251.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 126.375, + "completions/mean_terminated_length": 126.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2243128574063826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.11864777142181993, + "learning_rate": 1.9991240880860484e-05, + "loss": 0.0047, + "num_tokens": 10347126.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 310.375, + "completions/mean_terminated_length": 310.375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.22449732521674967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.07289290986955166, + "learning_rate": 1.9991105626946834e-05, + "loss": 0.0029, + "num_tokens": 10353785.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 354.125, + "completions/mean_terminated_length": 354.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.22468179302711677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.045955864479765296, + "learning_rate": 1.9990969337221794e-05, + "loss": 0.0018, + "num_tokens": 10360826.0, + "reward": 1.5125000476837158, + "reward_std": 0.5221863389015198, + "rewards/fixed_code_pass_all_test_reward/mean": 0.512499988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.5221863389015198, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 189.625, + "completions/mean_terminated_length": 189.625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.22486626083748387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.0777191398665309, + "learning_rate": 1.9990832011699496e-05, + "loss": 0.0031, + "num_tokens": 10365215.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 295.875, + "completions/mean_terminated_length": 295.875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.22505072864785094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.07428921409882605, + "learning_rate": 1.9990693650394176e-05, + "loss": 0.003, + "num_tokens": 10375246.0, + "reward": 1.258333444595337, + "reward_std": 0.5639289617538452, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, + "rewards/fixed_code_pass_all_test_reward/std": 0.3347588777542114, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 357.625, + "completions/mean_terminated_length": 357.625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.22523519645821805, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.051717916037887335, + "learning_rate": 1.9990554253320177e-05, + "loss": 0.0021, + "num_tokens": 10385923.0, + "reward": 1.5643938779830933, + "reward_std": 0.36756086349487305, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5643938779830933, + "rewards/fixed_code_pass_all_test_reward/std": 0.36756089329719543, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 206.625, + "completions/mean_terminated_length": 206.625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.22541966426858512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.06573385745286942, + "learning_rate": 1.999041382049195e-05, + "loss": 0.0026, + "num_tokens": 10390840.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 516.625, + "completions/mean_terminated_length": 516.625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.22560413207895222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.04347809497267008, + "learning_rate": 1.9990272351924057e-05, + "loss": 0.0017, + "num_tokens": 10401357.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 549.25, + "completions/mean_terminated_length": 549.25, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.22578859988931932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.03982335375621915, + "learning_rate": 1.9990129847631162e-05, + "loss": 0.0016, + "num_tokens": 10416863.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.2259730676996864, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.158511595800519, + "learning_rate": 1.9989986307628036e-05, + "loss": 0.0063, + "num_tokens": 10421690.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 300.375, + "completions/mean_terminated_length": 300.375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2261575355100535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.05229078442789614, + "learning_rate": 1.998984173192957e-05, + "loss": 0.0021, + "num_tokens": 10428085.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 234.125, + "completions/mean_terminated_length": 234.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2263420033204206, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.10355564951896667, + "learning_rate": 1.9989696120550744e-05, + "loss": 0.0041, + "num_tokens": 10438262.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 396.0, + "completions/mean_terminated_length": 396.0, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.22652647113078767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.04168181819841266, + "learning_rate": 1.998954947350666e-05, + "loss": 0.0017, + "num_tokens": 10446662.0, + "reward": 1.5357143878936768, + "reward_std": 0.3561210036277771, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5357142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3561210036277771, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 251.875, + "completions/mean_terminated_length": 251.875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.22671093894115477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.059747880324721336, + "learning_rate": 1.9989401790812516e-05, + "loss": 0.0024, + "num_tokens": 10454541.0, + "reward": 1.4090909957885742, + "reward_std": 0.4312196671962738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40909087657928467, + "rewards/fixed_code_pass_all_test_reward/std": 0.4312196969985962, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.22689540675152187, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.037049652775749564, + "learning_rate": 1.9989253072483625e-05, + "loss": 0.0015, + "num_tokens": 10465016.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 197.25, + "completions/mean_terminated_length": 197.25, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.22707987456188894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.055796783650293946, + "learning_rate": 1.9989103318535403e-05, + "loss": 0.0022, + "num_tokens": 10469410.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.22726434237225604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.06423219572752714, + "learning_rate": 1.998895252898338e-05, + "loss": 0.0026, + "num_tokens": 10478417.0, + "reward": 1.90234375, + "reward_std": 0.2762135863304138, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90234375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2762135863304138, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.22744881018262314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.14238657895475626, + "learning_rate": 1.9988800703843187e-05, + "loss": 0.0057, + "num_tokens": 10484500.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.2276332779929902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.03517152089625597, + "learning_rate": 1.998864784313056e-05, + "loss": 0.0014, + "num_tokens": 10492116.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 277.0, + "completions/mean_terminated_length": 277.0, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2278177458033573, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.04612201149575412, + "learning_rate": 1.9988493946861355e-05, + "loss": 0.0018, + "num_tokens": 10500716.0, + "reward": 1.826171875, + "reward_std": 0.2656168043613434, + "rewards/fixed_code_pass_all_test_reward/mean": 0.826171875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2656168043613434, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2280022136137244, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.06437376234680414, + "learning_rate": 1.998833901505152e-05, + "loss": 0.0026, + "num_tokens": 10506959.0, + "reward": 1.9318182468414307, + "reward_std": 0.19284728169441223, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 295.125, + "completions/mean_terminated_length": 295.125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.2281866814240915, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.06153057049959898, + "learning_rate": 1.998818304771712e-05, + "loss": 0.0025, + "num_tokens": 10516288.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2283711492344586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.06342857144773006, + "learning_rate": 1.998802604487433e-05, + "loss": 0.0025, + "num_tokens": 10522442.0, + "reward": 1.3145160675048828, + "reward_std": 0.42308980226516724, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3145161271095276, + "rewards/fixed_code_pass_all_test_reward/std": 0.4230898320674896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 294.875, + "completions/mean_terminated_length": 294.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2285556170448257, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21484375, + "kl": 0.04567981022410095, + "learning_rate": 1.998786800653941e-05, + "loss": 0.0018, + "num_tokens": 10531265.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 329.25, + "completions/mean_terminated_length": 329.25, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.22874008485519276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.050369120202958584, + "learning_rate": 1.998770893272876e-05, + "loss": 0.002, + "num_tokens": 10541315.0, + "reward": 1.9943182468414307, + "reward_std": 0.0035068909637629986, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9943181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.0035068909637629986, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 260.75, + "completions/mean_terminated_length": 260.75, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.22892455266555986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.03831752552650869, + "learning_rate": 1.9987548823458868e-05, + "loss": 0.0015, + "num_tokens": 10548433.0, + "reward": 1.8461538553237915, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.22910902047592696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.054408963304013014, + "learning_rate": 1.9987387678746334e-05, + "loss": 0.0022, + "num_tokens": 10557420.0, + "reward": 1.5, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 146.875, + "completions/mean_terminated_length": 146.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.22929348828629403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.047366100596264005, + "learning_rate": 1.998722549860786e-05, + "loss": 0.0019, + "num_tokens": 10561259.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.22947795609666113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.09962185856420547, + "learning_rate": 1.9987062283060264e-05, + "loss": 0.004, + "num_tokens": 10567613.0, + "reward": 1.9444444179534912, + "reward_std": 0.15713484585285187, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 549.25, + "completions/mean_terminated_length": 549.25, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "epoch": 0.22966242390702823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.023867676500231028, + "learning_rate": 1.9986898032120466e-05, + "loss": 0.001, + "num_tokens": 10578655.0, + "reward": 0.9208332896232605, + "reward_std": 0.37583237886428833, + "rewards/fixed_code_pass_all_test_reward/mean": 0.04583333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.05616727098822594, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2298468917173953, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.08998010493814945, + "learning_rate": 1.9986732745805494e-05, + "loss": 0.0036, + "num_tokens": 10586067.0, + "reward": 1.6666667461395264, + "reward_std": 0.2760262191295624, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2760262191295624, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.2300313595277624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.056372721679508686, + "learning_rate": 1.998656642413248e-05, + "loss": 0.0023, + "num_tokens": 10597049.0, + "reward": 1.295454502105713, + "reward_std": 0.13527704775333405, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.13527706265449524, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 209.375, + "completions/mean_terminated_length": 209.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2302158273381295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.04929253668524325, + "learning_rate": 1.9986399067118673e-05, + "loss": 0.002, + "num_tokens": 10602252.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 195.125, + "completions/mean_terminated_length": 195.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.23040029514849658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.03676022926811129, + "learning_rate": 1.9986230674781425e-05, + "loss": 0.0015, + "num_tokens": 10606877.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.23058476295886368, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.052048672921955585, + "learning_rate": 1.9986061247138188e-05, + "loss": 0.0021, + "num_tokens": 10612193.0, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.23076923076923078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0927588976919651, + "learning_rate": 1.998589078420653e-05, + "loss": 0.0037, + "num_tokens": 10618567.0, + "reward": 1.1576086282730103, + "reward_std": 0.06547567248344421, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15760868787765503, + "rewards/fixed_code_pass_all_test_reward/std": 0.06547567993402481, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.23095369857959785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.06392569048330188, + "learning_rate": 1.998571928600412e-05, + "loss": 0.0026, + "num_tokens": 10627246.0, + "reward": 1.75, + "reward_std": 0.37988796830177307, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.37988796830177307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 380.875, + "completions/mean_terminated_length": 380.875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.23113816638996496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.026241635205224156, + "learning_rate": 1.9985546752548742e-05, + "loss": 0.001, + "num_tokens": 10634605.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 305.125, + "completions/mean_terminated_length": 305.125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.23132263420033206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.06730982707813382, + "learning_rate": 1.9985373183858282e-05, + "loss": 0.0027, + "num_tokens": 10644358.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 316.0, + "completions/mean_terminated_length": 316.0, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.23150710201069913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.03514389740303159, + "learning_rate": 1.9985198579950734e-05, + "loss": 0.0014, + "num_tokens": 10651454.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 202.0, + "completions/mean_terminated_length": 202.0, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.23169156982106623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.42578125, + "kl": 0.12064300291240215, + "learning_rate": 1.99850229408442e-05, + "loss": 0.0048, + "num_tokens": 10658958.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 292.25, + "completions/mean_terminated_length": 292.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2318760376314333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.03367142647039145, + "learning_rate": 1.998484626655689e-05, + "loss": 0.0013, + "num_tokens": 10667432.0, + "reward": 1.933333396911621, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9333333373069763, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 263.0, + "completions/mean_terminated_length": 263.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.2320605054418004, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.05345094995573163, + "learning_rate": 1.998466855710712e-05, + "loss": 0.0021, + "num_tokens": 10675880.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 233.25, + "completions/mean_terminated_length": 233.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2322449732521675, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.048464043298736215, + "learning_rate": 1.9984489812513307e-05, + "loss": 0.0019, + "num_tokens": 10683234.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.23242944106253458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.05444542760960758, + "learning_rate": 1.9984310032793997e-05, + "loss": 0.0022, + "num_tokens": 10691129.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 251.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.23261390887290168, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.08660539775155485, + "learning_rate": 1.9984129217967815e-05, + "loss": 0.0035, + "num_tokens": 10697134.0, + "reward": 1.5625, + "reward_std": 0.47715675830841064, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.47715675830841064, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.23279837668326878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.05137174506671727, + "learning_rate": 1.998394736805351e-05, + "loss": 0.0021, + "num_tokens": 10706314.0, + "reward": 1.5555555820465088, + "reward_std": 0.31426969170570374, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5555555820465088, + "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.23298284449363585, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.061082344967871904, + "learning_rate": 1.998376448306994e-05, + "loss": 0.0024, + "num_tokens": 10710504.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 571.5, + "completions/mean_terminated_length": 571.5, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.23316731230400295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.01376206730492413, + "learning_rate": 1.998358056303606e-05, + "loss": 0.0006, + "num_tokens": 10725764.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 289.75, + "completions/mean_terminated_length": 289.75, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.23335178011437005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.0960350469686091, + "learning_rate": 1.9983395607970938e-05, + "loss": 0.0038, + "num_tokens": 10732306.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 189.25, + "completions/mean_terminated_length": 189.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.23353624792473712, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.0864503406919539, + "learning_rate": 1.998320961789375e-05, + "loss": 0.0035, + "num_tokens": 10739036.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 331.0, + "completions/mean_terminated_length": 331.0, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.23372071573510422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.033696640748530626, + "learning_rate": 1.998302259282378e-05, + "loss": 0.0013, + "num_tokens": 10749324.0, + "reward": 1.6590908765792847, + "reward_std": 0.21041366457939148, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6590908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.21041364967823029, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 163.75, + "completions/mean_terminated_length": 163.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.23390518354547132, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.148740918841213, + "learning_rate": 1.9982834532780414e-05, + "loss": 0.0059, + "num_tokens": 10755722.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 347.625, + "completions/mean_terminated_length": 347.625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.2340896513558384, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.065625402610749, + "learning_rate": 1.9982645437783152e-05, + "loss": 0.0026, + "num_tokens": 10767023.0, + "reward": 1.534926414489746, + "reward_std": 0.4382210373878479, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6599264740943909, + "rewards/fixed_code_pass_all_test_reward/std": 0.14609718322753906, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.2342741191662055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.04834607429802418, + "learning_rate": 1.9982455307851598e-05, + "loss": 0.0019, + "num_tokens": 10775143.0, + "reward": 1.921875, + "reward_std": 0.07867428660392761, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.07867428660392761, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 226.375, + "completions/mean_terminated_length": 226.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.2344585869765726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.05608690553344786, + "learning_rate": 1.998226414300546e-05, + "loss": 0.0022, + "num_tokens": 10784618.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 274.0, + "completions/mean_terminated_length": 274.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.23464305478693967, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.08392241224646568, + "learning_rate": 1.9982071943264557e-05, + "loss": 0.0034, + "num_tokens": 10790762.0, + "reward": 1.2638888359069824, + "reward_std": 0.19641852378845215, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2638888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.19641855359077454, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 363.125, + "completions/mean_terminated_length": 363.125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.23482752259730677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.04167533200234175, + "learning_rate": 1.998187870864882e-05, + "loss": 0.0017, + "num_tokens": 10798339.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 161.25, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.23501199040767387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.173828125, + "kl": 0.08124719653278589, + "learning_rate": 1.998168443917828e-05, + "loss": 0.0032, + "num_tokens": 10805213.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 145.875, + "completions/mean_terminated_length": 145.875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.23519645821804094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2294921875, + "kl": 0.11112395022064447, + "learning_rate": 1.9981489134873075e-05, + "loss": 0.0044, + "num_tokens": 10811764.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 357.25, + "completions/mean_terminated_length": 357.25, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.23538092602840804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.05705802049487829, + "learning_rate": 1.9981292795753453e-05, + "loss": 0.0023, + "num_tokens": 10818990.0, + "reward": 1.0806450843811035, + "reward_std": 0.08621326833963394, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08064515888690948, + "rewards/fixed_code_pass_all_test_reward/std": 0.08621330559253693, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 303.375, + "completions/mean_terminated_length": 303.375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.23556539383877514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.08685056306421757, + "learning_rate": 1.9981095421839772e-05, + "loss": 0.0035, + "num_tokens": 10830161.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 289.25, + "completions/mean_terminated_length": 289.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.23574986164914222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.0722743347287178, + "learning_rate": 1.9980897013152493e-05, + "loss": 0.0029, + "num_tokens": 10838619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 255.0, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.23593432945950932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.0664640236645937, + "learning_rate": 1.9980697569712187e-05, + "loss": 0.0027, + "num_tokens": 10847803.0, + "reward": 1.2770271301269531, + "reward_std": 0.14062222838401794, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2770270109176636, + "rewards/fixed_code_pass_all_test_reward/std": 0.14062219858169556, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.23611879726987642, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.07362988218665123, + "learning_rate": 1.998049709153953e-05, + "loss": 0.0029, + "num_tokens": 10856341.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 195.0, + "completions/mean_terminated_length": 195.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.2363032650802435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.05044866888783872, + "learning_rate": 1.9980295578655303e-05, + "loss": 0.002, + "num_tokens": 10864117.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 196.625, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.2364877328906106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.06563374446704984, + "learning_rate": 1.9980093031080403e-05, + "loss": 0.0026, + "num_tokens": 10868714.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 204.0, + "completions/mean_terminated_length": 204.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.2366722007009777, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.07010183844249696, + "learning_rate": 1.9979889448835825e-05, + "loss": 0.0028, + "num_tokens": 10875778.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.23685666851134476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.0588835421949625, + "learning_rate": 1.9979684831942677e-05, + "loss": 0.0024, + "num_tokens": 10883650.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.23704113632171186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.052842499455437064, + "learning_rate": 1.997947918042217e-05, + "loss": 0.0021, + "num_tokens": 10887483.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 256.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.23722560413207897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.07487613963894546, + "learning_rate": 1.997927249429563e-05, + "loss": 0.003, + "num_tokens": 10893719.0, + "reward": 1.4047620296478271, + "reward_std": 0.4929039478302002, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4047619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.4929039776325226, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 301.5, + "completions/mean_terminated_length": 301.5, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.23741007194244604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.0552777883131057, + "learning_rate": 1.997906477358448e-05, + "loss": 0.0022, + "num_tokens": 10903427.0, + "reward": 1.125, + "reward_std": 0.24800796806812286, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.24800795316696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.23759453975281314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.07676322432234883, + "learning_rate": 1.9978856018310253e-05, + "loss": 0.0031, + "num_tokens": 10911636.0, + "reward": 1.0074999332427979, + "reward_std": 0.02121318317949772, + "rewards/fixed_code_pass_all_test_reward/mean": 0.007499999832361937, + "rewards/fixed_code_pass_all_test_reward/std": 0.02121320366859436, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 511.375, + "completions/mean_terminated_length": 511.375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.2377790075631802, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.03342886897735298, + "learning_rate": 1.99786462284946e-05, + "loss": 0.0013, + "num_tokens": 10921079.0, + "reward": 1.4208333492279053, + "reward_std": 0.6381862759590149, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5458333492279053, + "rewards/fixed_code_pass_all_test_reward/std": 0.35542842745780945, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2379634753735473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.09023055341094732, + "learning_rate": 1.997843540415926e-05, + "loss": 0.0036, + "num_tokens": 10930633.0, + "reward": 1.9027777910232544, + "reward_std": 0.2749859392642975, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 288.5, + "completions/mean_terminated_length": 288.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.2381479431839144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.0638743401505053, + "learning_rate": 1.9978223545326097e-05, + "loss": 0.0026, + "num_tokens": 10942109.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.23833241099428149, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.0773246050812304, + "learning_rate": 1.9978010652017074e-05, + "loss": 0.0031, + "num_tokens": 10950924.0, + "reward": 1.2333333492279053, + "reward_std": 0.09428088366985321, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23333334922790527, + "rewards/fixed_code_pass_all_test_reward/std": 0.0942809134721756, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 277.375, + "completions/mean_terminated_length": 277.375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.23851687880464859, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.04100760817527771, + "learning_rate": 1.997779672425426e-05, + "loss": 0.0016, + "num_tokens": 10960991.0, + "reward": 1.1006944179534912, + "reward_std": 0.18659758567810059, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1006944477558136, + "rewards/fixed_code_pass_all_test_reward/std": 0.18659763038158417, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 480.625, + "completions/mean_terminated_length": 480.625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.23870134661501569, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.03814176539890468, + "learning_rate": 1.9977581762059833e-05, + "loss": 0.0015, + "num_tokens": 10968364.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 288.625, + "completions/mean_terminated_length": 288.625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.23888581442538276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.031211954541504383, + "learning_rate": 1.9977365765456085e-05, + "loss": 0.0012, + "num_tokens": 10975145.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 305.375, + "completions/mean_terminated_length": 305.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.23907028223574986, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.09488718491047621, + "learning_rate": 1.9977148734465403e-05, + "loss": 0.0038, + "num_tokens": 10986532.0, + "reward": 1.8416666984558105, + "reward_std": 0.15911462903022766, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8416666388511658, + "rewards/fixed_code_pass_all_test_reward/std": 0.15911459922790527, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 193.625, + "completions/mean_terminated_length": 193.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.23925475004611696, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.05824032728560269, + "learning_rate": 1.9976930669110292e-05, + "loss": 0.0023, + "num_tokens": 10995177.0, + "reward": 1.0364582538604736, + "reward_std": 0.014731382951140404, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0364583358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.01473139226436615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 139.25, + "completions/mean_terminated_length": 139.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.23943921785648403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.03924765903502703, + "learning_rate": 1.9976711569413353e-05, + "loss": 0.0016, + "num_tokens": 10999179.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 445.25, + "completions/mean_terminated_length": 445.25, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.23962368566685113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.021552508231252432, + "learning_rate": 1.997649143539731e-05, + "loss": 0.0009, + "num_tokens": 11008221.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 468.875, + "completions/mean_terminated_length": 468.875, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.23980815347721823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.06330589158460498, + "learning_rate": 1.9976270267084974e-05, + "loss": 0.0025, + "num_tokens": 11017596.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2399926212875853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1796875, + "kl": 0.06948912097141147, + "learning_rate": 1.9976048064499283e-05, + "loss": 0.0028, + "num_tokens": 11023371.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 245.625, + "completions/mean_terminated_length": 245.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.2401770890979524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.07741766143590212, + "learning_rate": 1.997582482766327e-05, + "loss": 0.0031, + "num_tokens": 11029352.0, + "reward": 1.375, + "reward_std": 0.41924625635147095, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.41924628615379333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 441.25, + "completions/mean_terminated_length": 441.25, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.2403615569083195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.06529013020917773, + "learning_rate": 1.997560055660008e-05, + "loss": 0.0026, + "num_tokens": 11042450.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 256.25, + "completions/mean_terminated_length": 256.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.24054602471868658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.07117086555808783, + "learning_rate": 1.997537525133296e-05, + "loss": 0.0028, + "num_tokens": 11053252.0, + "reward": 1.6666667461395264, + "reward_std": 0.46386152505874634, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.4638615846633911, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 369.375, + "completions/mean_terminated_length": 369.375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.24073049252905368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.07933639688417315, + "learning_rate": 1.9975148911885275e-05, + "loss": 0.0032, + "num_tokens": 11065407.0, + "reward": 1.8705356121063232, + "reward_std": 0.1502397060394287, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8705357313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.1502397358417511, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 399.5, + "completions/mean_terminated_length": 399.5, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.24091496033942078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.07196844788268209, + "learning_rate": 1.9974921538280485e-05, + "loss": 0.0029, + "num_tokens": 11076619.0, + "reward": 1.5597825050354004, + "reward_std": 0.015371894463896751, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5597826242446899, + "rewards/fixed_code_pass_all_test_reward/std": 0.015371883288025856, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 312.25, + "completions/mean_terminated_length": 312.25, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.24109942814978785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04975930065847933, + "learning_rate": 1.9974693130542168e-05, + "loss": 0.002, + "num_tokens": 11084085.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.24128389596015495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.06995000224560499, + "learning_rate": 1.9974463688693994e-05, + "loss": 0.0028, + "num_tokens": 11094385.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.24146836377052205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.16098308051005006, + "learning_rate": 1.9974233212759758e-05, + "loss": 0.0064, + "num_tokens": 11103242.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 283.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.24165283158088913, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.03872451093047857, + "learning_rate": 1.9974001702763356e-05, + "loss": 0.0015, + "num_tokens": 11109930.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 187.5, + "completions/mean_terminated_length": 187.5, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.24183729939125623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.02910477132536471, + "learning_rate": 1.997376915872878e-05, + "loss": 0.0012, + "num_tokens": 11114430.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 229.5, + "completions/mean_terminated_length": 229.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.24202176720162333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.053528830176219344, + "learning_rate": 1.997353558068015e-05, + "loss": 0.0021, + "num_tokens": 11120962.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 338.5, + "completions/mean_terminated_length": 338.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.2422062350119904, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.06050563510507345, + "learning_rate": 1.9973300968641675e-05, + "loss": 0.0024, + "num_tokens": 11130142.0, + "reward": 1.861328125, + "reward_std": 0.20920459926128387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.861328125, + "rewards/fixed_code_pass_all_test_reward/std": 0.20920461416244507, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 310.375, + "completions/mean_terminated_length": 310.375, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.2423907028223575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.021936406672466546, + "learning_rate": 1.9973065322637673e-05, + "loss": 0.0009, + "num_tokens": 11136449.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.2425751706327246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.06247883662581444, + "learning_rate": 1.9972828642692587e-05, + "loss": 0.0025, + "num_tokens": 11141647.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 189.0, + "completions/mean_terminated_length": 189.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.24275963844309167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.05816083587706089, + "learning_rate": 1.9972590928830945e-05, + "loss": 0.0023, + "num_tokens": 11146079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.24294410625345877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.04927245411090553, + "learning_rate": 1.9972352181077393e-05, + "loss": 0.002, + "num_tokens": 11150410.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.24312857406382588, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.06258185068145394, + "learning_rate": 1.997211239945669e-05, + "loss": 0.0025, + "num_tokens": 11161355.0, + "reward": 1.586538553237915, + "reward_std": 0.3653123080730438, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5865384340286255, + "rewards/fixed_code_pass_all_test_reward/std": 0.3653123378753662, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 231.875, + "completions/mean_terminated_length": 231.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.24331304187419295, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.07633263152092695, + "learning_rate": 1.9971871583993684e-05, + "loss": 0.0031, + "num_tokens": 11170282.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 274.875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.24349750968456005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.08423224557191133, + "learning_rate": 1.9971629734713346e-05, + "loss": 0.0034, + "num_tokens": 11179057.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 163.0, + "completions/max_terminated_length": 163.0, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.24368197749492715, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.033582302974537015, + "learning_rate": 1.9971386851640754e-05, + "loss": 0.0013, + "num_tokens": 11182897.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 285.875, + "completions/mean_terminated_length": 285.875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.24386644530529422, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.0745147867128253, + "learning_rate": 1.997114293480108e-05, + "loss": 0.003, + "num_tokens": 11188192.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.24405091311566132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.052544296719133854, + "learning_rate": 1.9970897984219614e-05, + "loss": 0.0021, + "num_tokens": 11192615.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 184.25, + "completions/mean_terminated_length": 184.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.2442353809260284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.031214418821036816, + "learning_rate": 1.997065199992176e-05, + "loss": 0.0012, + "num_tokens": 11197049.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 224.875, + "completions/mean_terminated_length": 224.875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.2444198487363955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.03052685991860926, + "learning_rate": 1.9970404981933006e-05, + "loss": 0.0012, + "num_tokens": 11202376.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.2446043165467626, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.029898729408159852, + "learning_rate": 1.997015693027897e-05, + "loss": 0.0012, + "num_tokens": 11212338.0, + "reward": 1.658046007156372, + "reward_std": 0.25195762515068054, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6580460071563721, + "rewards/fixed_code_pass_all_test_reward/std": 0.25195759534835815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 451.25, + "completions/mean_terminated_length": 451.25, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.24478878435712967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.05232251877896488, + "learning_rate": 1.9969907844985366e-05, + "loss": 0.0021, + "num_tokens": 11224412.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 451.75, + "completions/mean_terminated_length": 451.75, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.24497325216749677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.03530004946514964, + "learning_rate": 1.9969657726078017e-05, + "loss": 0.0014, + "num_tokens": 11232050.0, + "reward": 1.9642857313156128, + "reward_std": 0.10101523250341415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 210.5, + "completions/mean_terminated_length": 210.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.24515771997786387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.05684534879401326, + "learning_rate": 1.9969406573582857e-05, + "loss": 0.0023, + "num_tokens": 11236686.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 308.0, + "completions/mean_terminated_length": 308.0, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.24534218778823094, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.07199973566457629, + "learning_rate": 1.9969154387525918e-05, + "loss": 0.0029, + "num_tokens": 11243822.0, + "reward": 1.8461538553237915, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 351.0, + "completions/mean_terminated_length": 351.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.24552665559859804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.046237635193392634, + "learning_rate": 1.996890116793335e-05, + "loss": 0.0018, + "num_tokens": 11253350.0, + "reward": 1.322115421295166, + "reward_std": 0.4202892482280731, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32211539149284363, + "rewards/fixed_code_pass_all_test_reward/std": 0.4202892780303955, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 363.5, + "completions/mean_terminated_length": 363.5, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.24571112340896514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.055426849983632565, + "learning_rate": 1.9968646914831402e-05, + "loss": 0.0022, + "num_tokens": 11265458.0, + "reward": 1.7842261791229248, + "reward_std": 0.03788074478507042, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7842261791229248, + "rewards/fixed_code_pass_all_test_reward/std": 0.03788072615861893, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 234.75, + "completions/mean_terminated_length": 234.75, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.24589559121933222, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.251953125, + "kl": 0.06481029884889722, + "learning_rate": 1.9968391628246436e-05, + "loss": 0.0026, + "num_tokens": 11272888.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 291.125, + "completions/mean_terminated_length": 291.125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.24608005902969932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.08548794966191053, + "learning_rate": 1.9968135308204917e-05, + "loss": 0.0034, + "num_tokens": 11281721.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 390.125, + "completions/mean_terminated_length": 390.125, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.24626452684006642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.033831899520009756, + "learning_rate": 1.9967877954733413e-05, + "loss": 0.0014, + "num_tokens": 11290178.0, + "reward": 1.9291666746139526, + "reward_std": 0.04520680010318756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9291666746139526, + "rewards/fixed_code_pass_all_test_reward/std": 0.045206762850284576, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 157.625, + "completions/mean_terminated_length": 157.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2464489946504335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.04918299335986376, + "learning_rate": 1.9967619567858617e-05, + "loss": 0.002, + "num_tokens": 11294183.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 374.75, + "completions/mean_terminated_length": 374.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.2466334624608006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.025479455944150686, + "learning_rate": 1.9967360147607307e-05, + "loss": 0.001, + "num_tokens": 11301317.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 212.25, + "completions/mean_terminated_length": 212.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2468179302711677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.04540640325285494, + "learning_rate": 1.9967099694006384e-05, + "loss": 0.0018, + "num_tokens": 11305943.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 176.0, + "completions/mean_terminated_length": 176.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.24700239808153476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.05060215573757887, + "learning_rate": 1.9966838207082843e-05, + "loss": 0.002, + "num_tokens": 11310175.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 339.875, + "completions/mean_terminated_length": 339.875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.24718686589190186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.06601592618972063, + "learning_rate": 1.99665756868638e-05, + "loss": 0.0026, + "num_tokens": 11321286.0, + "reward": 1.0543477535247803, + "reward_std": 0.045004263520240784, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05434782803058624, + "rewards/fixed_code_pass_all_test_reward/std": 0.04500427842140198, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 160.5, + "completions/mean_terminated_length": 160.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.24737133370226896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.09385432559065521, + "learning_rate": 1.9966312133376466e-05, + "loss": 0.0038, + "num_tokens": 11325274.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 174.75, + "completions/mean_terminated_length": 174.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.24755580151263604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.053817325038835406, + "learning_rate": 1.996604754664817e-05, + "loss": 0.0022, + "num_tokens": 11329448.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 368.375, + "completions/mean_terminated_length": 368.375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.24774026932300314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.05339163402095437, + "learning_rate": 1.996578192670634e-05, + "loss": 0.0021, + "num_tokens": 11337467.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.24792473713337024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.08137089665979147, + "learning_rate": 1.996551527357851e-05, + "loss": 0.0033, + "num_tokens": 11346537.0, + "reward": 1.54347825050354, + "reward_std": 0.18446266651153564, + "rewards/fixed_code_pass_all_test_reward/mean": 0.54347825050354, + "rewards/fixed_code_pass_all_test_reward/std": 0.18446263670921326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 189.75, + "completions/mean_terminated_length": 189.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2481092049437373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.255859375, + "kl": 0.058191659627482295, + "learning_rate": 1.996524758729233e-05, + "loss": 0.0023, + "num_tokens": 11352279.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 247.0, + "completions/mean_terminated_length": 247.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2482936727541044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.1121706934645772, + "learning_rate": 1.996497886787555e-05, + "loss": 0.0045, + "num_tokens": 11361615.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 378.125, + "completions/mean_terminated_length": 378.125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.2484781405644715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.048165290616452694, + "learning_rate": 1.996470911535603e-05, + "loss": 0.0019, + "num_tokens": 11369528.0, + "reward": 1.7761627435684204, + "reward_std": 0.08958551287651062, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7761628031730652, + "rewards/fixed_code_pass_all_test_reward/std": 0.0895855501294136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 326.625, + "completions/mean_terminated_length": 326.625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.24866260837483858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.05890075536444783, + "learning_rate": 1.9964438329761736e-05, + "loss": 0.0024, + "num_tokens": 11379549.0, + "reward": 1.9509494304656982, + "reward_std": 0.13873612880706787, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9509493708610535, + "rewards/fixed_code_pass_all_test_reward/std": 0.13873615860939026, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 253.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.24884707618520568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.0925910216756165, + "learning_rate": 1.9964166511120736e-05, + "loss": 0.0037, + "num_tokens": 11387131.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 253.875, + "completions/mean_terminated_length": 253.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.24903154399557278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.06029292428866029, + "learning_rate": 1.9963893659461223e-05, + "loss": 0.0024, + "num_tokens": 11396186.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 127.875, + "completions/mean_terminated_length": 127.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.24921601180593986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1875, + "kl": 0.04121057072188705, + "learning_rate": 1.9963619774811467e-05, + "loss": 0.0016, + "num_tokens": 11399993.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 261.75, + "completions/mean_terminated_length": 261.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.24940047961630696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0696862330660224, + "learning_rate": 1.996334485719988e-05, + "loss": 0.0028, + "num_tokens": 11408063.0, + "reward": 1.9166667461395264, + "reward_std": 0.15430331230163574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 582.75, + "completions/mean_terminated_length": 582.75, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "epoch": 0.24958494742667406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.041105630807578564, + "learning_rate": 1.996306890665495e-05, + "loss": 0.0016, + "num_tokens": 11425253.0, + "reward": 1.1190476417541504, + "reward_std": 0.33671751618385315, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1190476194024086, + "rewards/fixed_code_pass_all_test_reward/std": 0.33671751618385315, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 244.125, + "completions/mean_terminated_length": 244.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.24976941523704113, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.045710903126746416, + "learning_rate": 1.9962791923205296e-05, + "loss": 0.0018, + "num_tokens": 11433806.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 287.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.24995388304740823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.07209481485188007, + "learning_rate": 1.9962513906879626e-05, + "loss": 0.0029, + "num_tokens": 11445056.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 328.625, + "completions/mean_terminated_length": 328.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.2501383508577753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.03035471262410283, + "learning_rate": 1.9962234857706768e-05, + "loss": 0.0012, + "num_tokens": 11454501.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 448.75, + "completions/mean_terminated_length": 448.75, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.25032281866814243, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.045521254651248455, + "learning_rate": 1.996195477571565e-05, + "loss": 0.0018, + "num_tokens": 11467267.0, + "reward": 1.5787036418914795, + "reward_std": 0.4952993094921112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5787037014961243, + "rewards/fixed_code_pass_all_test_reward/std": 0.4952993094921112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 314.625, + "completions/mean_terminated_length": 314.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.2505072864785095, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.046875, + "kl": 0.10953079350292683, + "learning_rate": 1.9961673660935304e-05, + "loss": 0.0044, + "num_tokens": 11474400.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 199.5, + "completions/mean_terminated_length": 199.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.2506917542888766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212890625, + "kl": 0.045237138867378235, + "learning_rate": 1.9961391513394886e-05, + "loss": 0.0018, + "num_tokens": 11479340.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 247.875, + "completions/mean_terminated_length": 247.875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.2508762220992437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.04525521257892251, + "learning_rate": 1.9961108333123634e-05, + "loss": 0.0018, + "num_tokens": 11488115.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 215.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2510606899096108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.027870766120031476, + "learning_rate": 1.9960824120150918e-05, + "loss": 0.0011, + "num_tokens": 11492923.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 487.375, + "completions/mean_terminated_length": 487.375, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.25124515771997785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.038976195035502315, + "learning_rate": 1.9960538874506194e-05, + "loss": 0.0016, + "num_tokens": 11502254.0, + "reward": 0.44999998807907104, + "reward_std": 0.8332380652427673, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.37032803893089294, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.251429625530345, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.08797367615625262, + "learning_rate": 1.9960252596219042e-05, + "loss": 0.0035, + "num_tokens": 11510997.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.25161409334071205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05915744509547949, + "learning_rate": 1.9959965285319135e-05, + "loss": 0.0024, + "num_tokens": 11520493.0, + "reward": 1.3445945978164673, + "reward_std": 0.26745641231536865, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3445945680141449, + "rewards/fixed_code_pass_all_test_reward/std": 0.26745641231536865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 481.0, + "completions/mean_terminated_length": 257.14288330078125, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2517985611510791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.62890625, + "kl": 0.06046892097219825, + "learning_rate": 1.9959676941836262e-05, + "loss": 0.0024, + "num_tokens": 11527885.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 294.875, + "completions/mean_terminated_length": 294.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.25198302896144625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.04656099248677492, + "learning_rate": 1.9959387565800314e-05, + "loss": 0.0019, + "num_tokens": 11536924.0, + "reward": 1.90625, + "reward_std": 0.1735912710428238, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2521674967718133, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.050233818124979734, + "learning_rate": 1.99590971572413e-05, + "loss": 0.002, + "num_tokens": 11540979.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 514.375, + "completions/mean_terminated_length": 514.375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.2523519645821804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.03871607221662998, + "learning_rate": 1.995880571618932e-05, + "loss": 0.0015, + "num_tokens": 11552446.0, + "reward": 1.1184210777282715, + "reward_std": 0.07309107482433319, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, + "rewards/fixed_code_pass_all_test_reward/std": 0.073091059923172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2525364323925475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.392578125, + "kl": 0.060727344709448516, + "learning_rate": 1.9958513242674588e-05, + "loss": 0.0024, + "num_tokens": 11556667.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 301.625, + "completions/mean_terminated_length": 301.625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.2527209002029146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.06769494875334203, + "learning_rate": 1.9958219736727428e-05, + "loss": 0.0027, + "num_tokens": 11563136.0, + "reward": 1.7037036418914795, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7037037014961243, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 178.75, + "completions/mean_terminated_length": 178.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.2529053680132817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.06499261315912008, + "learning_rate": 1.9957925198378273e-05, + "loss": 0.0026, + "num_tokens": 11572094.0, + "reward": 1.101694941520691, + "reward_std": 0.04707556217908859, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10169491171836853, + "rewards/fixed_code_pass_all_test_reward/std": 0.04707559570670128, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 250.875, + "completions/mean_terminated_length": 250.875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.25308983582364875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.059196391608566046, + "learning_rate": 1.9957629627657654e-05, + "loss": 0.0024, + "num_tokens": 11582181.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 374.75, + "completions/mean_terminated_length": 374.75, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.2532743036340159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.046248845756053925, + "learning_rate": 1.9957333024596214e-05, + "loss": 0.0018, + "num_tokens": 11588915.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 138.125, + "completions/mean_terminated_length": 138.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.25345877144438295, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.04817272152286023, + "learning_rate": 1.9957035389224702e-05, + "loss": 0.0019, + "num_tokens": 11592844.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.25364323925475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.054975228384137154, + "learning_rate": 1.9956736721573974e-05, + "loss": 0.0022, + "num_tokens": 11598226.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 238.125, + "completions/mean_terminated_length": 238.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.25382770706511715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.07083105435594916, + "learning_rate": 1.9956437021675003e-05, + "loss": 0.0028, + "num_tokens": 11605907.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.2540121748754842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.031956019112840295, + "learning_rate": 1.995613628955885e-05, + "loss": 0.0013, + "num_tokens": 11611577.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 601.75, + "completions/mean_terminated_length": 601.75, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "epoch": 0.2541966426858513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.019758016103878617, + "learning_rate": 1.9955834525256694e-05, + "loss": 0.0008, + "num_tokens": 11623031.0, + "reward": 1.8256173133850098, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9506173133850098, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 209.5, + "completions/mean_terminated_length": 209.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2543811104962184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.06361814518459141, + "learning_rate": 1.9955531728799823e-05, + "loss": 0.0025, + "num_tokens": 11627723.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 360.625, + "completions/mean_terminated_length": 360.625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.2545655783065855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.09679154166951776, + "learning_rate": 1.9955227900219625e-05, + "loss": 0.0039, + "num_tokens": 11635456.0, + "reward": 1.34375, + "reward_std": 0.1388959437608719, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.13889597356319427, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 437.875, + "completions/mean_terminated_length": 437.875, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.25475004611695257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.03198201581835747, + "learning_rate": 1.9954923039547606e-05, + "loss": 0.0013, + "num_tokens": 11648359.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 218.875, + "completions/mean_terminated_length": 218.875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.2549345139273197, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.07485616812482476, + "learning_rate": 1.9954617146815364e-05, + "loss": 0.003, + "num_tokens": 11653166.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 322.25, + "completions/mean_terminated_length": 322.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.25511898173768677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.06592679396271706, + "learning_rate": 1.995431022205462e-05, + "loss": 0.0026, + "num_tokens": 11659560.0, + "reward": 1.2887930870056152, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4137931168079376, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 351.625, + "completions/mean_terminated_length": 351.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.25530344954805384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.08018231624737382, + "learning_rate": 1.9954002265297188e-05, + "loss": 0.0032, + "num_tokens": 11670061.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 298.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.25548791735842097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.044821570510976017, + "learning_rate": 1.9953693276574993e-05, + "loss": 0.0018, + "num_tokens": 11679367.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 193.75, + "completions/mean_terminated_length": 193.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.25567238516878804, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.07324380381032825, + "learning_rate": 1.9953383255920076e-05, + "loss": 0.0029, + "num_tokens": 11688893.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 296.75, + "completions/mean_terminated_length": 296.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.2558568529791551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.053011367097496986, + "learning_rate": 1.995307220336457e-05, + "loss": 0.0021, + "num_tokens": 11694859.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 354.125, + "completions/mean_terminated_length": 354.125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.25604132078952224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.74609375, + "kl": 0.025649540941230953, + "learning_rate": 1.995276011894073e-05, + "loss": 0.001, + "num_tokens": 11702036.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 490.25, + "completions/mean_terminated_length": 490.25, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.2562257885998893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.06322388770058751, + "learning_rate": 1.9952447002680908e-05, + "loss": 0.0025, + "num_tokens": 11712030.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.2564102564102564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.09371439972892404, + "learning_rate": 1.9952132854617567e-05, + "loss": 0.0037, + "num_tokens": 11719100.0, + "reward": 1.7142857313156128, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7142857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 295.875, + "completions/mean_terminated_length": 295.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.2565947242206235, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.0839359606616199, + "learning_rate": 1.9951817674783272e-05, + "loss": 0.0034, + "num_tokens": 11725843.0, + "reward": 1.7946428060531616, + "reward_std": 0.19631260633468628, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7946428060531616, + "rewards/fixed_code_pass_all_test_reward/std": 0.1963125616312027, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 206.75, + "completions/mean_terminated_length": 206.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2567791920309906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19921875, + "kl": 0.06468530022539198, + "learning_rate": 1.9951501463210704e-05, + "loss": 0.0026, + "num_tokens": 11730385.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 228.0, + "completions/mean_terminated_length": 228.0, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.25696365984135766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.07508175959810615, + "learning_rate": 1.995118421993264e-05, + "loss": 0.003, + "num_tokens": 11736345.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 208.75, + "completions/mean_terminated_length": 208.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.2571481276517248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.84765625, + "kl": 0.14588174037635326, + "learning_rate": 1.995086594498197e-05, + "loss": 0.0058, + "num_tokens": 11743943.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 310.0, + "completions/mean_terminated_length": 310.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.25733259546209186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.054404240334406495, + "learning_rate": 1.99505466383917e-05, + "loss": 0.0022, + "num_tokens": 11753287.0, + "reward": 1.2544642686843872, + "reward_std": 0.4078482389450073, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2544642686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.4078482687473297, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.25751706327245893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0746133872307837, + "learning_rate": 1.9950226300194923e-05, + "loss": 0.003, + "num_tokens": 11759171.0, + "reward": 1.5336538553237915, + "reward_std": 0.488427996635437, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5336538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.488427996635437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.25770153108282606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.06124119693413377, + "learning_rate": 1.9949904930424857e-05, + "loss": 0.0025, + "num_tokens": 11766997.0, + "reward": 1.9598214626312256, + "reward_std": 0.024798767641186714, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9598214626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.024798741564154625, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 194.0, + "completions/mean_terminated_length": 194.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.25788599889319314, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.21875, + "kl": 0.05618661781772971, + "learning_rate": 1.994958252911481e-05, + "loss": 0.0022, + "num_tokens": 11771621.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 260.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.2580704667035602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.05733153177425265, + "learning_rate": 1.9949259096298217e-05, + "loss": 0.0023, + "num_tokens": 11777750.0, + "reward": 1.7777777910232544, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 264.375, + "completions/mean_terminated_length": 264.375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.25825493451392734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.04668461834080517, + "learning_rate": 1.99489346320086e-05, + "loss": 0.0019, + "num_tokens": 11782801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 357.875, + "completions/mean_terminated_length": 357.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2584394023242944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05440469807945192, + "learning_rate": 1.9948609136279607e-05, + "loss": 0.0022, + "num_tokens": 11790048.0, + "reward": 1.6022727489471436, + "reward_std": 0.32934948801994324, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6022727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.32934945821762085, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 346.875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.2586238701346615, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.05725490069016814, + "learning_rate": 1.9948282609144975e-05, + "loss": 0.0023, + "num_tokens": 11800759.0, + "reward": 1.98369562625885, + "reward_std": 0.046115659177303314, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9836956262588501, + "rewards/fixed_code_pass_all_test_reward/std": 0.04611567035317421, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 241.0, + "completions/mean_terminated_length": 241.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2588083379450286, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.11624108068645, + "learning_rate": 1.994795505063856e-05, + "loss": 0.0047, + "num_tokens": 11809727.0, + "reward": 1.1458333730697632, + "reward_std": 0.06076743081212044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333283662796, + "rewards/fixed_code_pass_all_test_reward/std": 0.060767434537410736, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 225.125, + "completions/mean_terminated_length": 225.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.2589928057553957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.11473184637725353, + "learning_rate": 1.994762646079432e-05, + "loss": 0.0046, + "num_tokens": 11818488.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 370.875, + "completions/mean_terminated_length": 370.875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.25917727356576276, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.06093479925766587, + "learning_rate": 1.9947296839646322e-05, + "loss": 0.0024, + "num_tokens": 11826527.0, + "reward": 1.942307710647583, + "reward_std": 0.06818503141403198, + "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.06818501651287079, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.2593617413761299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34765625, + "kl": 0.0812570583075285, + "learning_rate": 1.9946966187228736e-05, + "loss": 0.0033, + "num_tokens": 11830717.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 275.25, + "completions/mean_terminated_length": 275.25, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.25954620918649696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.14377528335899115, + "learning_rate": 1.994663450357585e-05, + "loss": 0.0058, + "num_tokens": 11839911.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 243.125, + "completions/mean_terminated_length": 243.125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.25973067699686403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03374645160511136, + "learning_rate": 1.994630178872204e-05, + "loss": 0.0013, + "num_tokens": 11844688.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 380.125, + "completions/mean_terminated_length": 380.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.25991514480723116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7109375, + "kl": 0.16678453609347343, + "learning_rate": 1.9945968042701805e-05, + "loss": 0.0067, + "num_tokens": 11854769.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 311.75, + "completions/mean_terminated_length": 311.75, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.26009961261759823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.03882792126387358, + "learning_rate": 1.9945633265549743e-05, + "loss": 0.0016, + "num_tokens": 11860119.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.2602840804279653, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.030256236786954105, + "learning_rate": 1.9945297457300568e-05, + "loss": 0.0012, + "num_tokens": 11866196.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 212.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.26046854823833243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.04119424242526293, + "learning_rate": 1.994496061798909e-05, + "loss": 0.0016, + "num_tokens": 11871256.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 209.625, + "completions/mean_terminated_length": 209.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.2606530160486995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.04438162315636873, + "learning_rate": 1.9944622747650225e-05, + "loss": 0.0018, + "num_tokens": 11875725.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.2608374838590666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.05461907386779785, + "learning_rate": 1.9944283846319012e-05, + "loss": 0.0022, + "num_tokens": 11885060.0, + "reward": 1.4090908765792847, + "reward_std": 0.7256475687026978, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.49896588921546936, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 344.0, + "completions/mean_terminated_length": 344.0, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.2610219516694337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.09730295138433576, + "learning_rate": 1.994394391403058e-05, + "loss": 0.0039, + "num_tokens": 11891956.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.2612064194798008, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.10154067352414131, + "learning_rate": 1.9943602950820167e-05, + "loss": 0.0041, + "num_tokens": 11900364.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 450.75, + "completions/mean_terminated_length": 450.75, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.26139088729016785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.05503968754783273, + "learning_rate": 1.994326095672313e-05, + "loss": 0.0022, + "num_tokens": 11909218.0, + "reward": 1.451923131942749, + "reward_std": 0.19905738532543182, + "rewards/fixed_code_pass_all_test_reward/mean": 0.45192307233810425, + "rewards/fixed_code_pass_all_test_reward/std": 0.19905738532543182, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 332.5, + "completions/mean_terminated_length": 332.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.261575355100535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.03780889441259205, + "learning_rate": 1.994291793177492e-05, + "loss": 0.0015, + "num_tokens": 11915950.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 229.5, + "completions/mean_terminated_length": 229.5, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.26175982291090205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.06218882976099849, + "learning_rate": 1.99425738760111e-05, + "loss": 0.0025, + "num_tokens": 11921626.0, + "reward": 1.4583333730697632, + "reward_std": 0.7130230665206909, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.4655483365058899, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 325.375, + "completions/mean_terminated_length": 325.375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2619442907212691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.05333886807784438, + "learning_rate": 1.9942228789467338e-05, + "loss": 0.0021, + "num_tokens": 11931477.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.26212875853163625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.71875, + "kl": 0.18561644107103348, + "learning_rate": 1.9941882672179415e-05, + "loss": 0.0074, + "num_tokens": 11939191.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 180.25, + "completions/mean_terminated_length": 180.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.2623132263420033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.08270745910704136, + "learning_rate": 1.994153552418321e-05, + "loss": 0.0033, + "num_tokens": 11943569.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 205.75, + "completions/mean_terminated_length": 205.75, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2624976941523704, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.05387852247804403, + "learning_rate": 1.9941187345514716e-05, + "loss": 0.0022, + "num_tokens": 11947911.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 311.75, + "completions/mean_terminated_length": 311.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.2626821619627375, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.07131146313622594, + "learning_rate": 1.9940838136210024e-05, + "loss": 0.0029, + "num_tokens": 11958677.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 246.625, + "completions/mean_terminated_length": 246.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.2628666297731046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11572265625, + "kl": 0.11212560068815947, + "learning_rate": 1.9940487896305346e-05, + "loss": 0.0045, + "num_tokens": 11964618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 238.875, + "completions/mean_terminated_length": 238.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.26305109758347167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.028809632174670696, + "learning_rate": 1.9940136625836986e-05, + "loss": 0.0012, + "num_tokens": 11970329.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 246.875, + "completions/mean_terminated_length": 246.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2632355653938388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.05921385670080781, + "learning_rate": 1.9939784324841365e-05, + "loss": 0.0024, + "num_tokens": 11978872.0, + "reward": 1.8486111164093018, + "reward_std": 0.20820359885692596, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8486111164093018, + "rewards/fixed_code_pass_all_test_reward/std": 0.20820365846157074, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2634200332042059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.0680231936275959, + "learning_rate": 1.9939430993355005e-05, + "loss": 0.0027, + "num_tokens": 11987145.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 331.0, + "completions/mean_terminated_length": 331.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.26360450101457295, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.061316484585404396, + "learning_rate": 1.993907663141454e-05, + "loss": 0.0025, + "num_tokens": 11994513.0, + "reward": 1.4955357313156128, + "reward_std": 0.2972404360771179, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4955357015132904, + "rewards/fixed_code_pass_all_test_reward/std": 0.2972404360771179, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 258.125, + "completions/mean_terminated_length": 258.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.2637889688249401, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.03558508423157036, + "learning_rate": 1.9938721239056703e-05, + "loss": 0.0014, + "num_tokens": 12003370.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.26397343663530715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.04458526871167123, + "learning_rate": 1.993836481631834e-05, + "loss": 0.0018, + "num_tokens": 12009634.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 325.375, + "completions/mean_terminated_length": 325.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.2641579044456742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.053767523262649775, + "learning_rate": 1.9938007363236405e-05, + "loss": 0.0022, + "num_tokens": 12020829.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 282.75, + "completions/mean_terminated_length": 282.75, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.26434237225604135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.047831503208726645, + "learning_rate": 1.993764887984796e-05, + "loss": 0.0019, + "num_tokens": 12027251.0, + "reward": 1.7840908765792847, + "reward_std": 0.40637627243995667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7840908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.40637627243995667, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 324.875, + "completions/mean_terminated_length": 324.875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2645268400664084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.08048037346452475, + "learning_rate": 1.993728936619016e-05, + "loss": 0.0032, + "num_tokens": 12034146.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 338.5, + "completions/mean_terminated_length": 338.5, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2647113078767755, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.05898463330231607, + "learning_rate": 1.9936928822300285e-05, + "loss": 0.0024, + "num_tokens": 12041662.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 252.875, + "completions/mean_terminated_length": 252.875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2648957756871426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.392578125, + "kl": 0.12772587314248085, + "learning_rate": 1.9936567248215715e-05, + "loss": 0.0051, + "num_tokens": 12050557.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 296.375, + "completions/mean_terminated_length": 296.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2650802434975097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.04071061499416828, + "learning_rate": 1.9936204643973927e-05, + "loss": 0.0016, + "num_tokens": 12057152.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 596.0, + "completions/mean_terminated_length": 388.5714416503906, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.26526471130787677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.06155144365038723, + "learning_rate": 1.993584100961252e-05, + "loss": 0.0025, + "num_tokens": 12066696.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 445.625, + "completions/mean_terminated_length": 445.625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.26544917911824384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.04479340324178338, + "learning_rate": 1.9935476345169192e-05, + "loss": 0.0018, + "num_tokens": 12079637.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 348.5, + "completions/mean_terminated_length": 348.5, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.26563364692861097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03564453125, + "kl": 0.02971238805912435, + "learning_rate": 1.9935110650681747e-05, + "loss": 0.0012, + "num_tokens": 12086665.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 280.375, + "completions/mean_terminated_length": 280.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.26581811473897804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.0264615376945585, + "learning_rate": 1.99347439261881e-05, + "loss": 0.0011, + "num_tokens": 12092524.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 363.875, + "completions/mean_terminated_length": 363.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.2660025825493451, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.04142060875892639, + "learning_rate": 1.993437617172627e-05, + "loss": 0.0017, + "num_tokens": 12100259.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.26618705035971224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.09183123568072915, + "learning_rate": 1.9934007387334386e-05, + "loss": 0.0037, + "num_tokens": 12104756.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 148.375, + "completions/mean_terminated_length": 148.375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.2663715181700793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.08403565781190991, + "learning_rate": 1.9933637573050677e-05, + "loss": 0.0034, + "num_tokens": 12108791.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2665559859804464, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.0812339624390006, + "learning_rate": 1.9933266728913482e-05, + "loss": 0.0032, + "num_tokens": 12113820.0, + "reward": 1.4943182468414307, + "reward_std": 0.21102647483348846, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4943181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.21102647483348846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 439.875, + "completions/mean_terminated_length": 439.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2667404537908135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.06180181237868965, + "learning_rate": 1.993289485496125e-05, + "loss": 0.0025, + "num_tokens": 12122643.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 335.125, + "completions/mean_terminated_length": 335.125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.2669249216011806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.033306349301710725, + "learning_rate": 1.993252195123254e-05, + "loss": 0.0013, + "num_tokens": 12129276.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.26710938941154766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.038190876599401236, + "learning_rate": 1.9932148017766e-05, + "loss": 0.0015, + "num_tokens": 12138504.0, + "reward": 1.5855262279510498, + "reward_std": 0.16747263073921204, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5855263471603394, + "rewards/fixed_code_pass_all_test_reward/std": 0.16747266054153442, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 230.625, + "completions/mean_terminated_length": 230.625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.2672938572219148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.984375, + "kl": 0.1057237982749939, + "learning_rate": 1.9931773054600402e-05, + "loss": 0.0042, + "num_tokens": 12146317.0, + "reward": 1.8236607313156128, + "reward_std": 0.056821055710315704, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8236607313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.05682109668850899, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 390.875, + "completions/mean_terminated_length": 390.875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.26747832503228186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31640625, + "kl": 0.1133955866098404, + "learning_rate": 1.9931397061774627e-05, + "loss": 0.0045, + "num_tokens": 12154412.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.26766279284264893, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.06177677889354527, + "learning_rate": 1.9931020039327646e-05, + "loss": 0.0025, + "num_tokens": 12163417.0, + "reward": 1.734375, + "reward_std": 0.2868976593017578, + "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2868976593017578, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 641.625, + "completions/mean_terminated_length": 641.625, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "epoch": 0.26784726065301606, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.036903930245898664, + "learning_rate": 1.9930641987298555e-05, + "loss": 0.0015, + "num_tokens": 12177750.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 287.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.26803172846338313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.04377555625978857, + "learning_rate": 1.9930262905726537e-05, + "loss": 0.0018, + "num_tokens": 12183606.0, + "reward": 1.774999976158142, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 612.125, + "completions/mean_terminated_length": 612.125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.2682161962737502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66015625, + "kl": 0.022061991039663553, + "learning_rate": 1.99298827946509e-05, + "loss": 0.0009, + "num_tokens": 12195335.0, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.26840066408411734, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.10417079832404852, + "learning_rate": 1.992950165411105e-05, + "loss": 0.0042, + "num_tokens": 12203611.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 313.75, + "completions/mean_terminated_length": 313.75, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.2685851318944844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.06013980298303068, + "learning_rate": 1.99291194841465e-05, + "loss": 0.0024, + "num_tokens": 12210569.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 455.0, + "completions/mean_terminated_length": 455.0, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.2687695997048515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.020456291851587594, + "learning_rate": 1.992873628479687e-05, + "loss": 0.0008, + "num_tokens": 12219337.0, + "reward": 1.4821429252624512, + "reward_std": 0.27465853095054626, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.27465856075286865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 213.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.2689540675152186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.05422973190434277, + "learning_rate": 1.992835205610189e-05, + "loss": 0.0022, + "num_tokens": 12223830.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 164.125, + "completions/mean_terminated_length": 164.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2691385353255857, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.08057411620393395, + "learning_rate": 1.9927966798101395e-05, + "loss": 0.0032, + "num_tokens": 12228023.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.26932300313595275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.03069179120939225, + "learning_rate": 1.9927580510835326e-05, + "loss": 0.0012, + "num_tokens": 12232448.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 299.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.2695074709463199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.04510441329330206, + "learning_rate": 1.9927193194343726e-05, + "loss": 0.0018, + "num_tokens": 12238798.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 373.0, + "completions/mean_terminated_length": 373.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.26969193875668696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.04545811784919351, + "learning_rate": 1.9926804848666753e-05, + "loss": 0.0018, + "num_tokens": 12248806.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.26987640656705403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.08558601979166269, + "learning_rate": 1.992641547384467e-05, + "loss": 0.0034, + "num_tokens": 12258283.0, + "reward": 1.990625023841858, + "reward_std": 0.02651650831103325, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9906250238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.026516500860452652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 224.625, + "completions/mean_terminated_length": 224.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.27006087437742116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.05573216360062361, + "learning_rate": 1.992602506991784e-05, + "loss": 0.0022, + "num_tokens": 12265584.0, + "reward": 1.9166667461395264, + "reward_std": 0.15430331230163574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 357.5, + "completions/mean_terminated_length": 357.5, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.27024534218778823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.06816479470580816, + "learning_rate": 1.992563363692674e-05, + "loss": 0.0027, + "num_tokens": 12272964.0, + "reward": 1.64130437374115, + "reward_std": 0.3421509265899658, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6413043141365051, + "rewards/fixed_code_pass_all_test_reward/std": 0.34215089678764343, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 419.375, + "completions/mean_terminated_length": 419.375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.2704298099981553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.017630112823098898, + "learning_rate": 1.9925241174911957e-05, + "loss": 0.0007, + "num_tokens": 12280991.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 239.375, + "completions/mean_terminated_length": 239.375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.27061427780852243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2138671875, + "kl": 0.09724992886185646, + "learning_rate": 1.9924847683914166e-05, + "loss": 0.0039, + "num_tokens": 12288890.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 120.25, + "completions/mean_terminated_length": 120.25, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.2707987456188895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.232421875, + "kl": 0.0600176730658859, + "learning_rate": 1.9924453163974168e-05, + "loss": 0.0024, + "num_tokens": 12292700.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2709832134292566, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.875, + "kl": 0.08349588373675942, + "learning_rate": 1.992405761513287e-05, + "loss": 0.0033, + "num_tokens": 12298024.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.2711676812396237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.08198619168251753, + "learning_rate": 1.992366103743127e-05, + "loss": 0.0033, + "num_tokens": 12305920.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2713521490499908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.04752695420756936, + "learning_rate": 1.992326343091049e-05, + "loss": 0.0019, + "num_tokens": 12315452.0, + "reward": 1.75, + "reward_std": 0.3450327515602112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.27153661686035785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.032079927856102586, + "learning_rate": 1.9922864795611746e-05, + "loss": 0.0013, + "num_tokens": 12327045.0, + "reward": 1.543269157409668, + "reward_std": 0.20397306978702545, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5432692766189575, + "rewards/fixed_code_pass_all_test_reward/std": 0.20397311449050903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 291.0, + "completions/mean_terminated_length": 291.0, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.271721084670725, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0390625, + "kl": 0.12552731623873115, + "learning_rate": 1.9922465131576372e-05, + "loss": 0.005, + "num_tokens": 12333333.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 159.875, + "completions/mean_terminated_length": 159.875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.27190555248109205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.04311150568537414, + "learning_rate": 1.9922064438845793e-05, + "loss": 0.0017, + "num_tokens": 12337620.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 282.375, + "completions/mean_terminated_length": 282.375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.2720900202914591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.017899402882903814, + "learning_rate": 1.992166271746156e-05, + "loss": 0.0007, + "num_tokens": 12346079.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 246.625, + "completions/mean_terminated_length": 246.625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.27227448810182625, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.032958275405690074, + "learning_rate": 1.9921259967465318e-05, + "loss": 0.0013, + "num_tokens": 12351572.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 139.75, + "completions/mean_terminated_length": 139.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.2724589559121933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1767578125, + "kl": 0.06398858223110437, + "learning_rate": 1.9920856188898817e-05, + "loss": 0.0026, + "num_tokens": 12355498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 212.0, + "completions/mean_terminated_length": 212.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.2726434237225604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.036508833640255034, + "learning_rate": 1.9920451381803922e-05, + "loss": 0.0015, + "num_tokens": 12366130.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 248.75, + "completions/mean_terminated_length": 248.75, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.2728278915329275, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.21875, + "kl": 0.040717258816584945, + "learning_rate": 1.9920045546222598e-05, + "loss": 0.0016, + "num_tokens": 12374760.0, + "reward": 1.3977272510528564, + "reward_std": 0.8637218475341797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.4019947648048401, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 284.25, + "completions/mean_terminated_length": 284.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.2730123593432946, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.04609573679044843, + "learning_rate": 1.9919638682196926e-05, + "loss": 0.0018, + "num_tokens": 12380570.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 279.25, + "completions/mean_terminated_length": 279.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.27319682715366167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.0416278587654233, + "learning_rate": 1.9919230789769078e-05, + "loss": 0.0017, + "num_tokens": 12389980.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 155.75, + "completions/mean_terminated_length": 155.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.2733812949640288, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.050751710776239634, + "learning_rate": 1.9918821868981347e-05, + "loss": 0.002, + "num_tokens": 12394122.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.27356576277439587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.07614199840463698, + "learning_rate": 1.9918411919876126e-05, + "loss": 0.003, + "num_tokens": 12401571.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 420.75, + "completions/mean_terminated_length": 420.75, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.27375023058476294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.11692187888547778, + "learning_rate": 1.9918000942495913e-05, + "loss": 0.0047, + "num_tokens": 12410729.0, + "reward": 1.7149999141693115, + "reward_std": 0.4087262749671936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7149999737739563, + "rewards/fixed_code_pass_all_test_reward/std": 0.4087262451648712, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 203.375, + "completions/mean_terminated_length": 203.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.27393469839513007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.052711405558511615, + "learning_rate": 1.9917588936883323e-05, + "loss": 0.0021, + "num_tokens": 12418948.0, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.27411916620549714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.055742616998031735, + "learning_rate": 1.991717590308106e-05, + "loss": 0.0022, + "num_tokens": 12423949.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 243.25, + "completions/mean_terminated_length": 243.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2743036340158642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.03949356428347528, + "learning_rate": 1.9916761841131952e-05, + "loss": 0.0016, + "num_tokens": 12433415.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.27448810182623135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.06568608293309808, + "learning_rate": 1.9916346751078924e-05, + "loss": 0.0026, + "num_tokens": 12441234.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 297.5, + "completions/mean_terminated_length": 297.5, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.2746725696365984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4765625, + "kl": 0.12739248387515545, + "learning_rate": 1.991593063296501e-05, + "loss": 0.0051, + "num_tokens": 12450870.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 222.5, + "completions/mean_terminated_length": 222.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.2748570374469655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.0613123734947294, + "learning_rate": 1.991551348683335e-05, + "loss": 0.0025, + "num_tokens": 12456810.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 260.875, + "completions/mean_terminated_length": 260.875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.2750415052573326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.0737862482201308, + "learning_rate": 1.991509531272719e-05, + "loss": 0.003, + "num_tokens": 12462057.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2752259730676997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.055398568976670504, + "learning_rate": 1.9914676110689888e-05, + "loss": 0.0022, + "num_tokens": 12470612.0, + "reward": 1.6153846979141235, + "reward_std": 0.26004746556282043, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, + "rewards/fixed_code_pass_all_test_reward/std": 0.26004746556282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 282.0, + "completions/mean_terminated_length": 282.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.27541044087806676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.059104107320308685, + "learning_rate": 1.99142558807649e-05, + "loss": 0.0024, + "num_tokens": 12477164.0, + "reward": 1.8461538553237915, + "reward_std": 0.28486770391464233, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.28486770391464233, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.2755949086884339, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.048838973976671696, + "learning_rate": 1.9913834622995787e-05, + "loss": 0.002, + "num_tokens": 12484795.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 226.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.27577937649880097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.04624909209087491, + "learning_rate": 1.9913412337426235e-05, + "loss": 0.0018, + "num_tokens": 12489499.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 326.75, + "completions/mean_terminated_length": 326.75, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.27596384430916804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.028489691205322742, + "learning_rate": 1.9912989024100016e-05, + "loss": 0.0011, + "num_tokens": 12496433.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 408.125, + "completions/mean_terminated_length": 408.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.27614831211953517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.03229889879003167, + "learning_rate": 1.9912564683061014e-05, + "loss": 0.0013, + "num_tokens": 12504722.0, + "reward": 1.1750000715255737, + "reward_std": 0.04629099741578102, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 197.5, + "completions/mean_terminated_length": 197.5, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.27633277992990224, + "frac_reward_zero_std": 1.0, + "grad_norm": 4.5625, + "kl": 0.4583804849535227, + "learning_rate": 1.991213931435323e-05, + "loss": 0.0183, + "num_tokens": 12512622.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2765172477402693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.052784796338528395, + "learning_rate": 1.9911712918020756e-05, + "loss": 0.0021, + "num_tokens": 12517170.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.27670171555063644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.1051879683509469, + "learning_rate": 1.99112854941078e-05, + "loss": 0.0042, + "num_tokens": 12527653.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.2768861833610035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.044017903972417116, + "learning_rate": 1.9910857042658675e-05, + "loss": 0.0018, + "num_tokens": 12531830.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2770706511713706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04962373268790543, + "learning_rate": 1.9910427563717803e-05, + "loss": 0.002, + "num_tokens": 12537544.0, + "reward": 1.7916667461395264, + "reward_std": 0.2480079084634781, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.24800792336463928, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 242.875, + "completions/mean_terminated_length": 242.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.2772551189817377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.06979273306205869, + "learning_rate": 1.9909997057329703e-05, + "loss": 0.0028, + "num_tokens": 12545959.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.2774395867921048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.0593261937610805, + "learning_rate": 1.9909565523539017e-05, + "loss": 0.0024, + "num_tokens": 12553764.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 244.5, + "completions/mean_terminated_length": 244.5, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.27762405460247186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28125, + "kl": 0.16169606428593397, + "learning_rate": 1.9909132962390472e-05, + "loss": 0.0065, + "num_tokens": 12564800.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.27780852241283893, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.921875, + "kl": 0.13693272415548563, + "learning_rate": 1.990869937392892e-05, + "loss": 0.0055, + "num_tokens": 12573911.0, + "reward": 0.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 241.625, + "completions/mean_terminated_length": 241.625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.27799299022320606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.06877797958441079, + "learning_rate": 1.990826475819931e-05, + "loss": 0.0028, + "num_tokens": 12581196.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.27817745803357313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.09001912456005812, + "learning_rate": 1.99078291152467e-05, + "loss": 0.0036, + "num_tokens": 12588807.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 244.125, + "completions/mean_terminated_length": 244.125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2783619258439402, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.03797531977761537, + "learning_rate": 1.9907392445116258e-05, + "loss": 0.0015, + "num_tokens": 12598224.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 225.5, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.27854639365430733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.032905344502069056, + "learning_rate": 1.990695474785325e-05, + "loss": 0.0013, + "num_tokens": 12603452.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 214.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2787308614646744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.047405785182490945, + "learning_rate": 1.9906516023503057e-05, + "loss": 0.0019, + "num_tokens": 12608493.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 421.0, + "completions/mean_terminated_length": 421.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2789153292750415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.06798795820213854, + "learning_rate": 1.9906076272111164e-05, + "loss": 0.0027, + "num_tokens": 12618581.0, + "reward": 1.109195351600647, + "reward_std": 0.11345508694648743, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10919541120529175, + "rewards/fixed_code_pass_all_test_reward/std": 0.11345507204532623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 370.25, + "completions/mean_terminated_length": 370.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.2790997970854086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.07963244523853064, + "learning_rate": 1.9905635493723156e-05, + "loss": 0.0032, + "num_tokens": 12625431.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 420.625, + "completions/mean_terminated_length": 420.625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.2792842648957757, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.04991452815011144, + "learning_rate": 1.9905193688384735e-05, + "loss": 0.002, + "num_tokens": 12635172.0, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 261.125, + "completions/mean_terminated_length": 261.125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.27946873270614275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.033029105281457305, + "learning_rate": 1.9904750856141705e-05, + "loss": 0.0013, + "num_tokens": 12640413.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 232.25, + "completions/mean_terminated_length": 232.25, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.2796532005165099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.044434914947487414, + "learning_rate": 1.9904306997039973e-05, + "loss": 0.0018, + "num_tokens": 12647919.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.27983766832687695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.051305771339684725, + "learning_rate": 1.9903862111125556e-05, + "loss": 0.0021, + "num_tokens": 12652075.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 135.625, + "completions/mean_terminated_length": 135.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.280022136137244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.203125, + "kl": 0.05507510039024055, + "learning_rate": 1.9903416198444577e-05, + "loss": 0.0022, + "num_tokens": 12655896.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 254.375, + "completions/mean_terminated_length": 254.375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.28020660394761115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.08361389115452766, + "learning_rate": 1.9902969259043266e-05, + "loss": 0.0033, + "num_tokens": 12664859.0, + "reward": 1.6728723049163818, + "reward_std": 0.46669280529022217, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6728723049163818, + "rewards/fixed_code_pass_all_test_reward/std": 0.46669283509254456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 334.875, + "completions/mean_terminated_length": 334.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.2803910717579782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.08538082288578153, + "learning_rate": 1.9902521292967956e-05, + "loss": 0.0034, + "num_tokens": 12673546.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.2805755395683453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.09065414126962423, + "learning_rate": 1.9902072300265093e-05, + "loss": 0.0036, + "num_tokens": 12683890.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.28076000737871243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.04352068458683789, + "learning_rate": 1.9901622280981225e-05, + "loss": 0.0017, + "num_tokens": 12688800.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 317.125, + "completions/mean_terminated_length": 317.125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.2809444751890795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.04294374352321029, + "learning_rate": 1.9901171235163006e-05, + "loss": 0.0017, + "num_tokens": 12695713.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 311.875, + "completions/mean_terminated_length": 311.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.2811289429994466, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.10341498721390963, + "learning_rate": 1.9900719162857195e-05, + "loss": 0.0041, + "num_tokens": 12706752.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 323.125, + "completions/mean_terminated_length": 323.125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2813134108098137, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.03259825287386775, + "learning_rate": 1.9900266064110664e-05, + "loss": 0.0013, + "num_tokens": 12713649.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 442.25, + "completions/mean_terminated_length": 442.25, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.2814978786201808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.06894711637869477, + "learning_rate": 1.9899811938970383e-05, + "loss": 0.0028, + "num_tokens": 12722099.0, + "reward": 1.1875, + "reward_std": 0.752970278263092, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.28168234643054785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.034998948336578906, + "learning_rate": 1.989935678748344e-05, + "loss": 0.0014, + "num_tokens": 12726919.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.281866814240915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03867588611319661, + "learning_rate": 1.989890060969701e-05, + "loss": 0.0015, + "num_tokens": 12734230.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 217.5, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.28205128205128205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.024953163811005652, + "learning_rate": 1.98984434056584e-05, + "loss": 0.001, + "num_tokens": 12739538.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 179.875, + "completions/mean_terminated_length": 179.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.2822357498616491, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.05601682420819998, + "learning_rate": 1.9897985175414998e-05, + "loss": 0.0022, + "num_tokens": 12746713.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 407.0, + "completions/mean_terminated_length": 407.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.28242021767201625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73046875, + "kl": 0.01990097528323531, + "learning_rate": 1.9897525919014318e-05, + "loss": 0.0008, + "num_tokens": 12756225.0, + "reward": 1.9166667461395264, + "reward_std": 0.15430331230163574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 215.625, + "completions/mean_terminated_length": 215.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.2826046854823833, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.04775161948055029, + "learning_rate": 1.9897065636503973e-05, + "loss": 0.0019, + "num_tokens": 12760950.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 434.5, + "completions/mean_terminated_length": 434.5, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.2827891532927504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.05505303223617375, + "learning_rate": 1.9896604327931675e-05, + "loss": 0.0022, + "num_tokens": 12770234.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 283.375, + "completions/mean_terminated_length": 283.375, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2829736211031175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.031462120881769806, + "learning_rate": 1.9896141993345254e-05, + "loss": 0.0013, + "num_tokens": 12776021.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 430.875, + "completions/mean_terminated_length": 430.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.2831580889134846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.05828576651401818, + "learning_rate": 1.9895678632792646e-05, + "loss": 0.0023, + "num_tokens": 12786076.0, + "reward": 1.419471025466919, + "reward_std": 0.44844457507133484, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5444711446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.45415154099464417, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 312.625, + "completions/mean_terminated_length": 312.625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.28334255672385167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.01509854116011411, + "learning_rate": 1.989521424632188e-05, + "loss": 0.0006, + "num_tokens": 12792161.0, + "reward": 1.7000000476837158, + "reward_std": 0.32071349024772644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.2835270245342188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.06230771308764815, + "learning_rate": 1.9894748833981107e-05, + "loss": 0.0025, + "num_tokens": 12815175.0, + "reward": 1.6731927394866943, + "reward_std": 0.30871447920799255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6731927394866943, + "rewards/fixed_code_pass_all_test_reward/std": 0.30871447920799255, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 199.75, + "completions/mean_terminated_length": 199.75, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.28371149234458587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.04396213870495558, + "learning_rate": 1.989428239581858e-05, + "loss": 0.0018, + "num_tokens": 12819597.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 498.625, + "completions/mean_terminated_length": 498.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.28389596015495294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.04362081678118557, + "learning_rate": 1.9893814931882647e-05, + "loss": 0.0017, + "num_tokens": 12830170.0, + "reward": 1.504807710647583, + "reward_std": 0.6686668395996094, + "rewards/fixed_code_pass_all_test_reward/mean": 0.629807710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.3770548105239868, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 189.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.28408042796532007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.03300483478233218, + "learning_rate": 1.989334644222178e-05, + "loss": 0.0013, + "num_tokens": 12834745.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 488.375, + "completions/mean_terminated_length": 488.375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.28426489577568714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.06385804596357048, + "learning_rate": 1.9892876926884544e-05, + "loss": 0.0026, + "num_tokens": 12848596.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 511.75, + "completions/mean_terminated_length": 511.75, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.2844493635860542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.048254032619297504, + "learning_rate": 1.9892406385919618e-05, + "loss": 0.0019, + "num_tokens": 12863626.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 365.25, + "completions/mean_terminated_length": 365.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.28463383139642134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.0624229870736599, + "learning_rate": 1.989193481937578e-05, + "loss": 0.0025, + "num_tokens": 12871020.0, + "reward": 1.85326087474823, + "reward_std": 0.18583010137081146, + "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, + "rewards/fixed_code_pass_all_test_reward/std": 0.18583005666732788, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 358.0, + "completions/mean_terminated_length": 358.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.2848182992067884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.0365498848259449, + "learning_rate": 1.989146222730193e-05, + "loss": 0.0015, + "num_tokens": 12881876.0, + "reward": 1.6328125, + "reward_std": 0.7368168234825134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7578125, + "rewards/fixed_code_pass_all_test_reward/std": 0.4487551152706146, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 402.0, + "completions/mean_terminated_length": 402.0, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.2850027670171555, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.044092579977586865, + "learning_rate": 1.989098860974705e-05, + "loss": 0.0018, + "num_tokens": 12889892.0, + "reward": 1.295454502105713, + "reward_std": 0.06428244709968567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 258.0, + "completions/mean_terminated_length": 258.0, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2851872348275226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.03914031758904457, + "learning_rate": 1.9890513966760246e-05, + "loss": 0.0016, + "num_tokens": 12897500.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 480.0, + "completions/mean_terminated_length": 480.0, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.2853717026378897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.03119426465127617, + "learning_rate": 1.989003829839073e-05, + "loss": 0.0012, + "num_tokens": 12908380.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 330.0, + "completions/mean_terminated_length": 330.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.28555617044825676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.04854785185307264, + "learning_rate": 1.9889561604687812e-05, + "loss": 0.0019, + "num_tokens": 12917580.0, + "reward": 1.650240421295166, + "reward_std": 0.34072890877723694, + "rewards/fixed_code_pass_all_test_reward/mean": 0.650240421295166, + "rewards/fixed_code_pass_all_test_reward/std": 0.34072890877723694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 153.625, + "completions/mean_terminated_length": 153.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2857406382586239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.02883279079105705, + "learning_rate": 1.9889083885700912e-05, + "loss": 0.0012, + "num_tokens": 12921745.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 316.125, + "completions/mean_terminated_length": 316.125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.28592510606899096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.048334666062146425, + "learning_rate": 1.9888605141479562e-05, + "loss": 0.0019, + "num_tokens": 12932194.0, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 633.5, + "completions/mean_terminated_length": 633.5, + "completions/min_length": 522.0, + "completions/min_terminated_length": 522.0, + "epoch": 0.28610957387935804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.02496644592611119, + "learning_rate": 1.988812537207339e-05, + "loss": 0.001, + "num_tokens": 12951134.0, + "reward": 1.8333332538604736, + "reward_std": 0.34503278136253357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.34503278136253357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 302.75, + "completions/mean_terminated_length": 302.75, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.28629404168972516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.048403201857581735, + "learning_rate": 1.9887644577532135e-05, + "loss": 0.0019, + "num_tokens": 12957676.0, + "reward": 1.3421052694320679, + "reward_std": 0.3107365071773529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34210526943206787, + "rewards/fixed_code_pass_all_test_reward/std": 0.3107365071773529, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.28647850950009224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.051626923494040966, + "learning_rate": 1.9887162757905644e-05, + "loss": 0.0021, + "num_tokens": 12962156.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1793.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 632.5, + "completions/mean_terminated_length": 632.5, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.2866629773104593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73046875, + "kl": 0.032839380437508225, + "learning_rate": 1.9886679913243873e-05, + "loss": 0.0013, + "num_tokens": 12973320.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 281.625, + "completions/mean_terminated_length": 281.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.28684744512082644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.05341592035256326, + "learning_rate": 1.9886196043596872e-05, + "loss": 0.0021, + "num_tokens": 12983877.0, + "reward": 1.90625, + "reward_std": 0.13567514717578888, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.13567513227462769, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 409.625, + "completions/mean_terminated_length": 409.625, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.2870319129311935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2314453125, + "kl": 0.05163940321654081, + "learning_rate": 1.9885711149014812e-05, + "loss": 0.0021, + "num_tokens": 12992330.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 306.875, + "completions/mean_terminated_length": 306.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.2872163807415606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.060778662795200944, + "learning_rate": 1.9885225229547957e-05, + "loss": 0.0024, + "num_tokens": 13001833.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.2874008485519277, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90234375, + "kl": 0.04718358791433275, + "learning_rate": 1.9884738285246694e-05, + "loss": 0.0019, + "num_tokens": 13011285.0, + "reward": 1.8365384340286255, + "reward_std": 0.3088919222354889, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8365384340286255, + "rewards/fixed_code_pass_all_test_reward/std": 0.3088919222354889, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 342.5, + "completions/mean_terminated_length": 342.5, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.2875853163622948, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.051910422393120825, + "learning_rate": 1.9884250316161494e-05, + "loss": 0.0021, + "num_tokens": 13023769.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.28776978417266186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.06802480923943222, + "learning_rate": 1.988376132234296e-05, + "loss": 0.0027, + "num_tokens": 13032137.0, + "reward": 1.33984375, + "reward_std": 0.2861368656158447, + "rewards/fixed_code_pass_all_test_reward/mean": 0.33984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2861368656158447, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.287954251983029, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.036979639902710915, + "learning_rate": 1.9883271303841774e-05, + "loss": 0.0015, + "num_tokens": 13036539.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.28813871979339606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.04171308968216181, + "learning_rate": 1.9882780260708746e-05, + "loss": 0.0017, + "num_tokens": 13040803.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 387.5, + "completions/mean_terminated_length": 387.5, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.28832318760376313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03706836479250342, + "learning_rate": 1.988228819299478e-05, + "loss": 0.0015, + "num_tokens": 13048543.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 317.875, + "completions/mean_terminated_length": 317.875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.28850765541413026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.02440270478837192, + "learning_rate": 1.9881795100750896e-05, + "loss": 0.001, + "num_tokens": 13055014.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 504.75, + "completions/mean_terminated_length": 504.75, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.28869212322449733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.044012056197971106, + "learning_rate": 1.9881300984028213e-05, + "loss": 0.0018, + "num_tokens": 13068444.0, + "reward": 1.115384578704834, + "reward_std": 0.5577396750450134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24038462340831757, + "rewards/fixed_code_pass_all_test_reward/std": 0.3426254987716675, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 206.0, + "completions/mean_terminated_length": 206.0, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.2888765910348644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.05059381015598774, + "learning_rate": 1.988080584287795e-05, + "loss": 0.002, + "num_tokens": 13075460.0, + "reward": 1.8611111640930176, + "reward_std": 0.19168488681316376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8611111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.19168488681316376, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 405.5, + "completions/mean_terminated_length": 405.5, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.28906105884523153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.040906468871980906, + "learning_rate": 1.988030967735145e-05, + "loss": 0.0016, + "num_tokens": 13087800.0, + "reward": 1.3185484409332275, + "reward_std": 0.17107422649860382, + "rewards/fixed_code_pass_all_test_reward/mean": 0.31854838132858276, + "rewards/fixed_code_pass_all_test_reward/std": 0.17107422649860382, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 276.125, + "completions/mean_terminated_length": 276.125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.2892455266555986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.08860543975606561, + "learning_rate": 1.9879812487500145e-05, + "loss": 0.0035, + "num_tokens": 13099297.0, + "reward": 1.274999976158142, + "reward_std": 0.4527692198753357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.5014265775680542, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.2894299944659657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.054959756787866354, + "learning_rate": 1.9879314273375584e-05, + "loss": 0.0022, + "num_tokens": 13105470.0, + "reward": 1.84375, + "reward_std": 0.35197147727012634, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.2896144622763328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.041789666283875704, + "learning_rate": 1.9878815035029418e-05, + "loss": 0.0017, + "num_tokens": 13109600.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 339.0, + "completions/mean_terminated_length": 339.0, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2897989300866999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04521609912626445, + "learning_rate": 1.9878314772513405e-05, + "loss": 0.0018, + "num_tokens": 13121144.0, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.28998339789706695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.05744500714354217, + "learning_rate": 1.987781348587941e-05, + "loss": 0.0023, + "num_tokens": 13128563.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 432.125, + "completions/mean_terminated_length": 432.125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.290167865707434, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.03745120466919616, + "learning_rate": 1.9877311175179397e-05, + "loss": 0.0015, + "num_tokens": 13139388.0, + "reward": 1.5625, + "reward_std": 0.4955156147480011, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 226.75, + "completions/mean_terminated_length": 226.75, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.29035233351780115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11669921875, + "kl": 0.09419935662299395, + "learning_rate": 1.987680784046545e-05, + "loss": 0.0038, + "num_tokens": 13146946.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.2905368013281682, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.03891092468984425, + "learning_rate": 1.9876303481789745e-05, + "loss": 0.0016, + "num_tokens": 13151855.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 244.625, + "completions/mean_terminated_length": 244.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.2907212691385353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.10609187046065927, + "learning_rate": 1.9875798099204575e-05, + "loss": 0.0042, + "num_tokens": 13161428.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 256.625, + "completions/mean_terminated_length": 256.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2909057369489024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.0602632244117558, + "learning_rate": 1.9875291692762336e-05, + "loss": 0.0024, + "num_tokens": 13169465.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2910902047592695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.070282943546772, + "learning_rate": 1.9874784262515522e-05, + "loss": 0.0028, + "num_tokens": 13176810.0, + "reward": 1.5875000953674316, + "reward_std": 0.3482097089290619, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5874999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.3482097089290619, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 356.5, + "completions/mean_terminated_length": 356.5, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.2912746725696366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.048758917255327106, + "learning_rate": 1.9874275808516745e-05, + "loss": 0.002, + "num_tokens": 13186398.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 382.375, + "completions/mean_terminated_length": 382.375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.2914591403800037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.04576763557270169, + "learning_rate": 1.987376633081872e-05, + "loss": 0.0018, + "num_tokens": 13198649.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 219.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.2916436081903708, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.06209050607867539, + "learning_rate": 1.9873255829474258e-05, + "loss": 0.0025, + "num_tokens": 13210083.0, + "reward": 1.0367647409439087, + "reward_std": 0.08545470237731934, + "rewards/fixed_code_pass_all_test_reward/mean": 0.036764707416296005, + "rewards/fixed_code_pass_all_test_reward/std": 0.08545467257499695, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 187.375, + "completions/mean_terminated_length": 187.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.29182807600073785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2353515625, + "kl": 0.06386336800642312, + "learning_rate": 1.9872744304536294e-05, + "loss": 0.0026, + "num_tokens": 13214406.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.292012543811105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.09755423618480563, + "learning_rate": 1.9872231756057855e-05, + "loss": 0.0039, + "num_tokens": 13222841.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.29219701162147205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.0826618371065706, + "learning_rate": 1.9871718184092078e-05, + "loss": 0.0033, + "num_tokens": 13231959.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 548.375, + "completions/mean_terminated_length": 548.375, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "epoch": 0.2923814794318391, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.042786670150235295, + "learning_rate": 1.987120358869221e-05, + "loss": 0.0017, + "num_tokens": 13242650.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.29256594724220625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.05961371725425124, + "learning_rate": 1.9870687969911597e-05, + "loss": 0.0024, + "num_tokens": 13251068.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.2927504150525733, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.07194199599325657, + "learning_rate": 1.9870171327803694e-05, + "loss": 0.0029, + "num_tokens": 13260572.0, + "reward": 1.5255101919174194, + "reward_std": 0.3938828110694885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5255101919174194, + "rewards/fixed_code_pass_all_test_reward/std": 0.3938828408718109, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 767.75, + "completions/mean_terminated_length": 767.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.2929348828629404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.54296875, + "kl": 0.04116517829243094, + "learning_rate": 1.986965366242207e-05, + "loss": 0.0016, + "num_tokens": 13275442.0, + "reward": 1.0, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.2931193506733075, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.05867267865687609, + "learning_rate": 1.986913497382039e-05, + "loss": 0.0023, + "num_tokens": 13286532.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2933038184836746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.07635619444772601, + "learning_rate": 1.986861526205242e-05, + "loss": 0.0031, + "num_tokens": 13296521.0, + "reward": 1.9027777910232544, + "reward_std": 0.2749859392642975, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 224.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.29348828629404167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.18401164468377829, + "learning_rate": 1.986809452717205e-05, + "loss": 0.0074, + "num_tokens": 13305239.0, + "reward": 1.4728260040283203, + "reward_std": 0.33391231298446655, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5978260636329651, + "rewards/fixed_code_pass_all_test_reward/std": 0.1639210432767868, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 164.625, + "completions/mean_terminated_length": 164.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2936727541044088, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.2538868556730449, + "learning_rate": 1.9867572769233262e-05, + "loss": 0.0102, + "num_tokens": 13312380.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.29385722191477587, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.08793225674889982, + "learning_rate": 1.9867049988290154e-05, + "loss": 0.0035, + "num_tokens": 13321022.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 341.75, + "completions/mean_terminated_length": 341.75, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.29404168972514294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.423828125, + "kl": 0.10111156711354852, + "learning_rate": 1.9866526184396916e-05, + "loss": 0.004, + "num_tokens": 13331196.0, + "reward": 1.0714285373687744, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 305.75, + "completions/mean_terminated_length": 305.75, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.29422615753551007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.061437806114554405, + "learning_rate": 1.986600135760786e-05, + "loss": 0.0025, + "num_tokens": 13338418.0, + "reward": 1.4800000190734863, + "reward_std": 0.440778523683548, + "rewards/fixed_code_pass_all_test_reward/mean": 0.47999998927116394, + "rewards/fixed_code_pass_all_test_reward/std": 0.440778523683548, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 104.625, + "completions/mean_terminated_length": 104.625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.29441062534587714, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.90625, + "kl": 0.11072788201272488, + "learning_rate": 1.986547550797739e-05, + "loss": 0.0044, + "num_tokens": 13342239.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2945950931562442, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.08745036227628589, + "learning_rate": 1.986494863556003e-05, + "loss": 0.0035, + "num_tokens": 13350088.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 222.125, + "completions/mean_terminated_length": 222.125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.29477956096661134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.0520627856021747, + "learning_rate": 1.9864420740410395e-05, + "loss": 0.0021, + "num_tokens": 13359913.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.2949640287769784, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.02589505910873413, + "learning_rate": 1.986389182258322e-05, + "loss": 0.001, + "num_tokens": 13364973.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 214.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.2951484965873455, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.11187402484938502, + "learning_rate": 1.9863361882133332e-05, + "loss": 0.0045, + "num_tokens": 13375822.0, + "reward": 1.2727272510528564, + "reward_std": 0.16833092272281647, + "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, + "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 153.0, + "completions/mean_terminated_length": 153.0, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2953329643977126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.06600933731533587, + "learning_rate": 1.9862830919115683e-05, + "loss": 0.0026, + "num_tokens": 13380062.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.2955174322080797, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.08473279420286417, + "learning_rate": 1.9862298933585307e-05, + "loss": 0.0034, + "num_tokens": 13386206.0, + "reward": 0.8914834856987, + "reward_std": 0.787941038608551, + "rewards/fixed_code_pass_all_test_reward/mean": 0.26648351550102234, + "rewards/fixed_code_pass_all_test_reward/std": 0.35295820236206055, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 140.0, + "completions/max_terminated_length": 140.0, + "completions/mean_length": 101.125, + "completions/mean_terminated_length": 101.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.29570190001844676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.07852287101559341, + "learning_rate": 1.9861765925597365e-05, + "loss": 0.0031, + "num_tokens": 13389831.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 151.375, + "completions/mean_terminated_length": 151.375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.2958863678288139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.05218710470944643, + "learning_rate": 1.9861231895207116e-05, + "loss": 0.0021, + "num_tokens": 13394178.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 156.125, + "completions/mean_terminated_length": 156.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.29607083563918096, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.14273787289857864, + "learning_rate": 1.9860696842469923e-05, + "loss": 0.0057, + "num_tokens": 13399235.0, + "reward": 1.6124999523162842, + "reward_std": 0.44860896468162537, + "rewards/fixed_code_pass_all_test_reward/mean": 0.612500011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.44860899448394775, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 354.75, + "completions/mean_terminated_length": 354.75, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.29625530344954804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.032891144044697285, + "learning_rate": 1.9860160767441253e-05, + "loss": 0.0013, + "num_tokens": 13407329.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 109.125, + "completions/mean_terminated_length": 109.125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.29643977125991516, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.828125, + "kl": 0.04199417436029762, + "learning_rate": 1.9859623670176688e-05, + "loss": 0.0017, + "num_tokens": 13410930.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.29662423907028224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.07756194565445185, + "learning_rate": 1.9859085550731905e-05, + "loss": 0.0031, + "num_tokens": 13422893.0, + "reward": 1.4362244606018066, + "reward_std": 0.32797372341156006, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43622449040412903, + "rewards/fixed_code_pass_all_test_reward/std": 0.32797375321388245, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 514.375, + "completions/mean_terminated_length": 514.375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.2968087068806493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6484375, + "kl": 0.021326712623704225, + "learning_rate": 1.9858546409162696e-05, + "loss": 0.0009, + "num_tokens": 13436080.0, + "reward": 1.9629629850387573, + "reward_std": 0.022859742864966393, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9629629850387573, + "rewards/fixed_code_pass_all_test_reward/std": 0.022859742864966393, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.29699317469101644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.14404586819000542, + "learning_rate": 1.985800624552496e-05, + "loss": 0.0058, + "num_tokens": 13446416.0, + "reward": 1.435185194015503, + "reward_std": 0.21413865685462952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43518519401550293, + "rewards/fixed_code_pass_all_test_reward/std": 0.2141387015581131, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 165.0, + "completions/mean_terminated_length": 165.0, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.2971776425013835, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.134765625, + "kl": 0.06881027901545167, + "learning_rate": 1.985746505987469e-05, + "loss": 0.0028, + "num_tokens": 13453296.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 258.75, + "completions/mean_terminated_length": 258.75, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.2973621103117506, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.05991009762510657, + "learning_rate": 1.9856922852268e-05, + "loss": 0.0024, + "num_tokens": 13461382.0, + "reward": 0.75, + "reward_std": 1.0350983142852783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.375, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.2975465781221177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.09846128011122346, + "learning_rate": 1.9856379622761094e-05, + "loss": 0.0039, + "num_tokens": 13469952.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 291.125, + "completions/mean_terminated_length": 291.125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.2977310459324848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.0443333275616169, + "learning_rate": 1.9855835371410296e-05, + "loss": 0.0018, + "num_tokens": 13481649.0, + "reward": 1.3333333730697632, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.29791551374285186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.015118375478778034, + "learning_rate": 1.9855290098272033e-05, + "loss": 0.0006, + "num_tokens": 13487248.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 111.125, + "completions/mean_terminated_length": 111.125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.298099981553219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.04992464650422335, + "learning_rate": 1.9854743803402825e-05, + "loss": 0.002, + "num_tokens": 13490937.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 227.625, + "completions/mean_terminated_length": 227.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.29828444936358606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.04677146906033158, + "learning_rate": 1.985419648685932e-05, + "loss": 0.0019, + "num_tokens": 13496086.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 369.125, + "completions/mean_terminated_length": 369.125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.29846891717395313, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.03819232643581927, + "learning_rate": 1.9853648148698254e-05, + "loss": 0.0015, + "num_tokens": 13508607.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 196.875, + "completions/mean_terminated_length": 196.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.29865338498432026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03640616801567376, + "learning_rate": 1.985309878897647e-05, + "loss": 0.0015, + "num_tokens": 13513094.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 251.625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.29883785279468733, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.08982761995866895, + "learning_rate": 1.9852548407750935e-05, + "loss": 0.0036, + "num_tokens": 13519443.0, + "reward": 1.0806450843811035, + "reward_std": 0.08621328324079514, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08064515888690948, + "rewards/fixed_code_pass_all_test_reward/std": 0.08621330559253693, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 460.875, + "completions/mean_terminated_length": 460.875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.2990223206050544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76953125, + "kl": 0.05403604102320969, + "learning_rate": 1.98519970050787e-05, + "loss": 0.0022, + "num_tokens": 13531226.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 186.5, + "completions/mean_terminated_length": 186.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.29920678841542153, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03963741788174957, + "learning_rate": 1.985144458101693e-05, + "loss": 0.0016, + "num_tokens": 13535630.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 364.5, + "completions/mean_terminated_length": 364.5, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.2993912562257886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.06751695438288152, + "learning_rate": 1.98508911356229e-05, + "loss": 0.0027, + "num_tokens": 13545666.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 163.0, + "completions/mean_terminated_length": 163.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2995757240361557, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.10334598645567894, + "learning_rate": 1.9850336668953988e-05, + "loss": 0.0041, + "num_tokens": 13553018.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 322.5, + "completions/mean_terminated_length": 322.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.2997601918465228, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.029492180794477463, + "learning_rate": 1.9849781181067674e-05, + "loss": 0.0012, + "num_tokens": 13559422.0, + "reward": 1.9375, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 350.5, + "completions/mean_terminated_length": 350.5, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.2999446596568899, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76953125, + "kl": 0.02903251990210265, + "learning_rate": 1.984922467202155e-05, + "loss": 0.0012, + "num_tokens": 13565962.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 183.125, + "completions/mean_terminated_length": 183.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.30012912746725695, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.10230041341856122, + "learning_rate": 1.984866714187331e-05, + "loss": 0.0041, + "num_tokens": 13573203.0, + "reward": 1.75, + "reward_std": 0.15430331230163574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 288.75, + "completions/mean_terminated_length": 288.75, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3003135952776241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.07317360024899244, + "learning_rate": 1.9848108590680756e-05, + "loss": 0.0029, + "num_tokens": 13580153.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 170.75, + "completions/mean_terminated_length": 170.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.30049806308799115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.10520786978304386, + "learning_rate": 1.984754901850179e-05, + "loss": 0.0042, + "num_tokens": 13589159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 103.125, + "completions/mean_terminated_length": 103.125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3006825308983582, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.14203880447894335, + "learning_rate": 1.984698842539444e-05, + "loss": 0.0057, + "num_tokens": 13595272.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.30086699870872535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.08123304788023233, + "learning_rate": 1.9846426811416806e-05, + "loss": 0.0032, + "num_tokens": 13604206.0, + "reward": 1.8055555820465088, + "reward_std": 0.37912923097610474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8055555820465088, + "rewards/fixed_code_pass_all_test_reward/std": 0.37912923097610474, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.3010514665190924, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.022659472131635994, + "learning_rate": 1.984586417662712e-05, + "loss": 0.0009, + "num_tokens": 13609670.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 214.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3012359343294595, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.18599138129502535, + "learning_rate": 1.9845300521083713e-05, + "loss": 0.0074, + "num_tokens": 13618520.0, + "reward": 1.7580645084381104, + "reward_std": 0.37301579117774963, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7580645084381104, + "rewards/fixed_code_pass_all_test_reward/std": 0.37301579117774963, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 384.875, + "completions/mean_terminated_length": 384.875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.3014204021398266, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.09427990880794823, + "learning_rate": 1.9844735844845023e-05, + "loss": 0.0038, + "num_tokens": 13631015.0, + "reward": 1.111111044883728, + "reward_std": 0.5988426804542542, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2361111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.40761780738830566, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 394.25, + "completions/mean_terminated_length": 394.25, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.3016048699501937, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.039595776004716754, + "learning_rate": 1.9844170147969585e-05, + "loss": 0.0016, + "num_tokens": 13638561.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 277.75, + "completions/mean_terminated_length": 277.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.30178933776056077, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.10102736484259367, + "learning_rate": 1.9843603430516055e-05, + "loss": 0.004, + "num_tokens": 13646543.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 263.875, + "completions/mean_terminated_length": 263.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3019738055709279, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.09597691288217902, + "learning_rate": 1.984303569254318e-05, + "loss": 0.0038, + "num_tokens": 13657318.0, + "reward": 1.774999976158142, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 246.125, + "completions/mean_terminated_length": 246.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.302158273381295, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.08649599319323897, + "learning_rate": 1.984246693410982e-05, + "loss": 0.0035, + "num_tokens": 13663503.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 613.125, + "completions/mean_terminated_length": 613.125, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.30234274119166205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.05350990151055157, + "learning_rate": 1.9841897155274945e-05, + "loss": 0.0021, + "num_tokens": 13679336.0, + "reward": 1.1484375, + "reward_std": 0.639071524143219, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5021577477455139, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 225.5, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.3025272090020291, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.0798694253899157, + "learning_rate": 1.9841326356097622e-05, + "loss": 0.0032, + "num_tokens": 13687820.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 224.875, + "completions/mean_terminated_length": 224.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.30271167681239625, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.1079470282420516, + "learning_rate": 1.9840754536637025e-05, + "loss": 0.0043, + "num_tokens": 13696299.0, + "reward": 1.9298245906829834, + "reward_std": 0.12993964552879333, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9298245906829834, + "rewards/fixed_code_pass_all_test_reward/std": 0.12993967533111572, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 159.625, + "completions/mean_terminated_length": 159.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3028961446227633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.06968524935655296, + "learning_rate": 1.9840181696952446e-05, + "loss": 0.0028, + "num_tokens": 13700560.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 371.75, + "completions/mean_terminated_length": 371.75, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.3030806124331304, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.09412944735959172, + "learning_rate": 1.9839607837103263e-05, + "loss": 0.0038, + "num_tokens": 13710270.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.3032650802434975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.1541341943666339, + "learning_rate": 1.9839032957148974e-05, + "loss": 0.0062, + "num_tokens": 13715110.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.3034495480538646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.06373753608204424, + "learning_rate": 1.983845705714918e-05, + "loss": 0.0025, + "num_tokens": 13719530.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 123.625, + "completions/mean_terminated_length": 123.625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.30363401586423167, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "kl": 0.16125423833727837, + "learning_rate": 1.9837880137163586e-05, + "loss": 0.0065, + "num_tokens": 13725151.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 236.875, + "completions/mean_terminated_length": 236.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3038184836745988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.09002807457000017, + "learning_rate": 1.9837302197252e-05, + "loss": 0.0036, + "num_tokens": 13732742.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.30400295148496587, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.20437498856335878, + "learning_rate": 1.9836723237474342e-05, + "loss": 0.0082, + "num_tokens": 13739018.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 502.0, + "completions/mean_terminated_length": 502.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.30418741929533294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.06580684497021139, + "learning_rate": 1.983614325789063e-05, + "loss": 0.0026, + "num_tokens": 13748330.0, + "reward": 1.600000023841858, + "reward_std": 0.5014265179634094, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.5014265179634094, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.30437188710570007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.04742681514471769, + "learning_rate": 1.9835562258561005e-05, + "loss": 0.0019, + "num_tokens": 13754404.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.30455635491606714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06347104976885021, + "learning_rate": 1.9834980239545686e-05, + "loss": 0.0025, + "num_tokens": 13759410.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 250.0, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3047408227264342, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.3125, + "kl": 0.14658135455101728, + "learning_rate": 1.9834397200905024e-05, + "loss": 0.0059, + "num_tokens": 13767562.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 356.625, + "completions/mean_terminated_length": 356.625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.30492529053680134, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.10572158545255661, + "learning_rate": 1.983381314269946e-05, + "loss": 0.0042, + "num_tokens": 13778735.0, + "reward": 1.024999976158142, + "reward_std": 0.04629099741578102, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, + "rewards/fixed_code_pass_all_test_reward/std": 0.04629100486636162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 372.25, + "completions/mean_terminated_length": 372.25, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.3051097583471684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.10880009178072214, + "learning_rate": 1.9833228064989543e-05, + "loss": 0.0044, + "num_tokens": 13786089.0, + "reward": 0.9879031777381897, + "reward_std": 0.39051389694213867, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11290322244167328, + "rewards/fixed_code_pass_all_test_reward/std": 0.06678053736686707, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 349.125, + "completions/mean_terminated_length": 349.125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.3052942261575355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.08194419136270881, + "learning_rate": 1.983264196783593e-05, + "loss": 0.0033, + "num_tokens": 13793018.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3054786939679026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.08314052177593112, + "learning_rate": 1.9832054851299388e-05, + "loss": 0.0033, + "num_tokens": 13804123.0, + "reward": 1.2050971984863281, + "reward_std": 0.08628341555595398, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20509707927703857, + "rewards/fixed_code_pass_all_test_reward/std": 0.08628340065479279, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.3056631617782697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.06289996206760406, + "learning_rate": 1.9831466715440787e-05, + "loss": 0.0025, + "num_tokens": 13808777.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 248.625, + "completions/mean_terminated_length": 248.625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.30584762958863676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.10293893702328205, + "learning_rate": 1.9830877560321094e-05, + "loss": 0.0041, + "num_tokens": 13817846.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 406.125, + "completions/mean_terminated_length": 406.125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.3060320973990039, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.07124783424660563, + "learning_rate": 1.983028738600139e-05, + "loss": 0.0028, + "num_tokens": 13825855.0, + "reward": 1.4921875, + "reward_std": 0.6716413497924805, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6171875, + "rewards/fixed_code_pass_all_test_reward/std": 0.42017680406570435, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 118.625, + "completions/mean_terminated_length": 118.625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.30621656520937096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.09433985035866499, + "learning_rate": 1.9829696192542864e-05, + "loss": 0.0038, + "num_tokens": 13829428.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 509.375, + "completions/mean_terminated_length": 509.375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.30640103301973803, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.054964892100542784, + "learning_rate": 1.9829103980006808e-05, + "loss": 0.0022, + "num_tokens": 13838287.0, + "reward": 1.3854167461395264, + "reward_std": 0.5040483474731445, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.32443055510520935, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.30658550083010516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.0928284740075469, + "learning_rate": 1.982851074845461e-05, + "loss": 0.0037, + "num_tokens": 13861094.0, + "reward": 1.966867446899414, + "reward_std": 0.0937129408121109, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9668674468994141, + "rewards/fixed_code_pass_all_test_reward/std": 0.0937129482626915, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 429.125, + "completions/mean_terminated_length": 429.125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.30676996864047223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.05304495617747307, + "learning_rate": 1.9827916497947787e-05, + "loss": 0.0021, + "num_tokens": 13870103.0, + "reward": 1.0214285850524902, + "reward_std": 0.03967802971601486, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02142857201397419, + "rewards/fixed_code_pass_all_test_reward/std": 0.03967800736427307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 183.125, + "completions/mean_terminated_length": 183.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3069544364508393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.11310707870870829, + "learning_rate": 1.982732122854793e-05, + "loss": 0.0045, + "num_tokens": 13874312.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 205.625, + "completions/mean_terminated_length": 205.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.30713890426120644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.06347531673964113, + "learning_rate": 1.9826724940316767e-05, + "loss": 0.0025, + "num_tokens": 13878773.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 336.875, + "completions/mean_terminated_length": 336.875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.3073233720715735, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.08976999670267105, + "learning_rate": 1.9826127633316106e-05, + "loss": 0.0036, + "num_tokens": 13886052.0, + "reward": 1.4943182468414307, + "reward_std": 0.25271546840667725, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4943181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.25271546840667725, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 281.25, + "completions/mean_terminated_length": 281.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3075078398819406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.06499262992292643, + "learning_rate": 1.9825529307607883e-05, + "loss": 0.0026, + "num_tokens": 13895134.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 226.125, + "completions/mean_terminated_length": 226.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.3076923076923077, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.07601966988295317, + "learning_rate": 1.9824929963254118e-05, + "loss": 0.003, + "num_tokens": 13902727.0, + "reward": 1.975000023841858, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.3078767755026748, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.16574146365746856, + "learning_rate": 1.9824329600316948e-05, + "loss": 0.0066, + "num_tokens": 13910788.0, + "reward": 1.6734694242477417, + "reward_std": 0.37882933020591736, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6734694242477417, + "rewards/fixed_code_pass_all_test_reward/std": 0.37882930040359497, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 193.5, + "completions/mean_terminated_length": 193.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.30806124331304185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.059343681670725346, + "learning_rate": 1.9823728218858624e-05, + "loss": 0.0024, + "num_tokens": 13915184.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.308245711123409, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.08318747580051422, + "learning_rate": 1.9823125818941484e-05, + "loss": 0.0033, + "num_tokens": 13921363.0, + "reward": 1.8571429252624512, + "reward_std": 0.23741397261619568, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.23741400241851807, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 488.0, + "completions/mean_terminated_length": 488.0, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.30843017893377606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.08815860049799085, + "learning_rate": 1.9822522400627985e-05, + "loss": 0.0035, + "num_tokens": 13935363.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 468.0, + "completions/mean_terminated_length": 468.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.30861464674414313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.07267849054187536, + "learning_rate": 1.9821917963980686e-05, + "loss": 0.0029, + "num_tokens": 13948811.0, + "reward": 1.3303570747375488, + "reward_std": 0.358034610748291, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4553571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.39849844574928284, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 646.375, + "completions/mean_terminated_length": 646.375, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "epoch": 0.30879911455451026, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.05856925481930375, + "learning_rate": 1.9821312509062247e-05, + "loss": 0.0023, + "num_tokens": 13965486.0, + "reward": 1.03125, + "reward_std": 0.4317220449447632, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.30898358236487733, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.46875, + "kl": 0.10472957044839859, + "learning_rate": 1.982070603593544e-05, + "loss": 0.0042, + "num_tokens": 13969772.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.3091680501752444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.09940497297793627, + "learning_rate": 1.9820098544663142e-05, + "loss": 0.004, + "num_tokens": 13979868.0, + "reward": 1.6530611515045166, + "reward_std": 0.4158174395561218, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6530612707138062, + "rewards/fixed_code_pass_all_test_reward/std": 0.4158174693584442, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 498.0, + "completions/mean_terminated_length": 498.0, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.30935251798561153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.07028262037783861, + "learning_rate": 1.981949003530833e-05, + "loss": 0.0028, + "num_tokens": 13990228.0, + "reward": 0.8571428060531616, + "reward_std": 0.3662113845348358, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1071428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.1478712111711502, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3095369857959786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.546875, + "kl": 0.17270728386938572, + "learning_rate": 1.9818880507934094e-05, + "loss": 0.0069, + "num_tokens": 13997500.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.3097214536063457, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.84375, + "kl": 0.07210355903953314, + "learning_rate": 1.981826996260362e-05, + "loss": 0.0029, + "num_tokens": 14001604.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 244.125, + "completions/mean_terminated_length": 244.125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.3099059214167128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.1321876011788845, + "learning_rate": 1.9817658399380212e-05, + "loss": 0.0053, + "num_tokens": 14007437.0, + "reward": 1.5, + "reward_std": 0.3401506841182709, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.3401506841182709, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.3100903892270799, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.0725417211651802, + "learning_rate": 1.9817045818327268e-05, + "loss": 0.0029, + "num_tokens": 14017912.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 179.25, + "completions/mean_terminated_length": 179.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.31027485703744695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.07207641378045082, + "learning_rate": 1.9816432219508297e-05, + "loss": 0.0029, + "num_tokens": 14022170.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3104593248478141, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.20165031217038631, + "learning_rate": 1.9815817602986916e-05, + "loss": 0.0081, + "num_tokens": 14031580.0, + "reward": 1.2678570747375488, + "reward_std": 0.45456862449645996, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 314.5, + "completions/mean_terminated_length": 314.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.31064379265818115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0709644271992147, + "learning_rate": 1.981520196882684e-05, + "loss": 0.0028, + "num_tokens": 14042056.0, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, + "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 219.25, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3108282604685482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.11232777498662472, + "learning_rate": 1.98145853170919e-05, + "loss": 0.0045, + "num_tokens": 14050458.0, + "reward": 1.8081395626068115, + "reward_std": 0.3606526255607605, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8081395626068115, + "rewards/fixed_code_pass_all_test_reward/std": 0.3606526255607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 205.125, + "completions/mean_terminated_length": 205.125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.31101272827891535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.12830605870112777, + "learning_rate": 1.9813967647846018e-05, + "loss": 0.0051, + "num_tokens": 14058667.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 220.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.3111971960892824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.07878115074709058, + "learning_rate": 1.981334896115324e-05, + "loss": 0.0032, + "num_tokens": 14070223.0, + "reward": 1.423076868057251, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.42307692766189575, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 486.25, + "completions/mean_terminated_length": 486.25, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.3113816638996495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.05654013575986028, + "learning_rate": 1.9812729257077695e-05, + "loss": 0.0023, + "num_tokens": 14084353.0, + "reward": 1.120192289352417, + "reward_std": 0.4987039268016815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24519230425357819, + "rewards/fixed_code_pass_all_test_reward/std": 0.23162616789340973, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 207.0, + "completions/mean_terminated_length": 207.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3115661317100166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.030497669125907123, + "learning_rate": 1.9812108535683637e-05, + "loss": 0.0012, + "num_tokens": 14089065.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 485.75, + "completions/mean_terminated_length": 485.75, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.3117505995203837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0986328125, + "kl": 0.08446342032402754, + "learning_rate": 1.981148679703542e-05, + "loss": 0.0034, + "num_tokens": 14101935.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 204.875, + "completions/mean_terminated_length": 204.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.31193506733075077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.08789316844195127, + "learning_rate": 1.9810864041197502e-05, + "loss": 0.0035, + "num_tokens": 14108662.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 295.75, + "completions/mean_terminated_length": 295.75, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3121195351411179, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.0816650060005486, + "learning_rate": 1.9810240268234438e-05, + "loss": 0.0033, + "num_tokens": 14118828.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 240.5, + "completions/mean_terminated_length": 240.5, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.31230400295148497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.08146433951333165, + "learning_rate": 1.9809615478210907e-05, + "loss": 0.0033, + "num_tokens": 14128328.0, + "reward": 1.2307692766189575, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23076923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 182.125, + "completions/mean_terminated_length": 182.125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.31248847076185204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.05591106438077986, + "learning_rate": 1.9808989671191675e-05, + "loss": 0.0022, + "num_tokens": 14135369.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 217.375, + "completions/mean_terminated_length": 217.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.3126729385722192, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.08190474798902869, + "learning_rate": 1.9808362847241627e-05, + "loss": 0.0033, + "num_tokens": 14145028.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 391.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.31285740638258625, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.060504299122840166, + "learning_rate": 1.9807735006425747e-05, + "loss": 0.0024, + "num_tokens": 14153055.0, + "reward": 1.5833332538604736, + "reward_std": 0.28171810507774353, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.28171807527542114, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 152.625, + "completions/mean_terminated_length": 152.625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3130418741929533, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.96875, + "kl": 0.09807438356801867, + "learning_rate": 1.980710614880912e-05, + "loss": 0.0039, + "num_tokens": 14157100.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.31322634200332045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.10653264913707972, + "learning_rate": 1.980647627445695e-05, + "loss": 0.0043, + "num_tokens": 14165138.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 370.0, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.3134108098136875, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.06363171525299549, + "learning_rate": 1.980584538343453e-05, + "loss": 0.0025, + "num_tokens": 14173082.0, + "reward": 1.7589285373687744, + "reward_std": 0.4467855989933014, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7589285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.446785569190979, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 154.75, + "completions/mean_terminated_length": 154.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3135952776240546, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.546875, + "kl": 0.048664238303899765, + "learning_rate": 1.9805213475807274e-05, + "loss": 0.0019, + "num_tokens": 14179672.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 229.25, + "completions/mean_terminated_length": 229.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.3137797454344217, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.1875, + "kl": 0.1719646283891052, + "learning_rate": 1.9804580551640685e-05, + "loss": 0.0069, + "num_tokens": 14184546.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 272.125, + "completions/mean_terminated_length": 272.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3139642132447888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.06589964078739285, + "learning_rate": 1.9803946611000394e-05, + "loss": 0.0026, + "num_tokens": 14191123.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 445.5, + "completions/mean_terminated_length": 445.5, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.31414868105515587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.051606345223262906, + "learning_rate": 1.980331165395211e-05, + "loss": 0.0021, + "num_tokens": 14200471.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 245.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.314333148865523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.08974823076277971, + "learning_rate": 1.9802675680561667e-05, + "loss": 0.0036, + "num_tokens": 14208904.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 322.125, + "completions/mean_terminated_length": 322.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.31451761667589007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.60546875, + "kl": 0.04024562076665461, + "learning_rate": 1.9802038690895e-05, + "loss": 0.0016, + "num_tokens": 14215961.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 171.75, + "completions/mean_terminated_length": 171.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.31470208448625714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.12559832073748112, + "learning_rate": 1.980140068501815e-05, + "loss": 0.005, + "num_tokens": 14221079.0, + "reward": 1.875, + "reward_std": 0.2368176281452179, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2368176281452179, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 212.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.3148865522966242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.12382021732628345, + "learning_rate": 1.9800761662997254e-05, + "loss": 0.005, + "num_tokens": 14230035.0, + "reward": 1.6538461446762085, + "reward_std": 0.40703868865966797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.40703868865966797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 280.875, + "completions/mean_terminated_length": 280.875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.31507102010699134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.082843370269984, + "learning_rate": 1.980012162489856e-05, + "loss": 0.0033, + "num_tokens": 14239602.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 300.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.3152554879173584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.07852113293483853, + "learning_rate": 1.9799480570788433e-05, + "loss": 0.0031, + "num_tokens": 14246878.0, + "reward": 1.21875, + "reward_std": 0.24775780737400055, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, + "rewards/fixed_code_pass_all_test_reward/std": 0.24775780737400055, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 205.125, + "completions/mean_terminated_length": 205.125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.3154399557277255, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.09031384671106935, + "learning_rate": 1.9798838500733327e-05, + "loss": 0.0036, + "num_tokens": 14255015.0, + "reward": 1.1931817531585693, + "reward_std": 0.09436090290546417, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1931818276643753, + "rewards/fixed_code_pass_all_test_reward/std": 0.09436088055372238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.3156244235380926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.10014249850064516, + "learning_rate": 1.979819541479981e-05, + "loss": 0.004, + "num_tokens": 14263255.0, + "reward": 1.6959459781646729, + "reward_std": 0.20822589099407196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6959459781646729, + "rewards/fixed_code_pass_all_test_reward/std": 0.20822592079639435, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 303.375, + "completions/mean_terminated_length": 303.375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.3158088913484597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.07781041180714965, + "learning_rate": 1.979755131305455e-05, + "loss": 0.0031, + "num_tokens": 14272810.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.31599335915882676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2490234375, + "kl": 0.05364137451397255, + "learning_rate": 1.979690619556433e-05, + "loss": 0.0021, + "num_tokens": 14277736.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 168.75, + "completions/mean_terminated_length": 168.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.3161778269691939, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.06099038943648338, + "learning_rate": 1.979626006239602e-05, + "loss": 0.0024, + "num_tokens": 14285470.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 313.75, + "completions/mean_terminated_length": 313.75, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.31636229477956096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.057390640024095774, + "learning_rate": 1.979561291361661e-05, + "loss": 0.0023, + "num_tokens": 14296220.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 161.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.31654676258992803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.07466377504169941, + "learning_rate": 1.9794964749293203e-05, + "loss": 0.003, + "num_tokens": 14303702.0, + "reward": 1.9166667461395264, + "reward_std": 0.15430331230163574, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.15430334210395813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 297.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.31673123040029516, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.042671683710068464, + "learning_rate": 1.9794315569492982e-05, + "loss": 0.0017, + "num_tokens": 14309856.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.31691569821066223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.03824770194478333, + "learning_rate": 1.979366537428326e-05, + "loss": 0.0015, + "num_tokens": 14317513.0, + "reward": 1.25, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 146.375, + "completions/mean_terminated_length": 146.375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.3171001660210293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.02705427212640643, + "learning_rate": 1.979301416373144e-05, + "loss": 0.0011, + "num_tokens": 14321532.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 676.875, + "completions/mean_terminated_length": 676.875, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "epoch": 0.31728463383139643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029052734375, + "kl": 0.026389537379145622, + "learning_rate": 1.9792361937905038e-05, + "loss": 0.0011, + "num_tokens": 14338331.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 148.0, + "completions/mean_terminated_length": 148.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.3174691016417635, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.09620907926000655, + "learning_rate": 1.979170869687167e-05, + "loss": 0.0038, + "num_tokens": 14344643.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 246.625, + "completions/mean_terminated_length": 246.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3176535694521306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.10019650589674711, + "learning_rate": 1.9791054440699057e-05, + "loss": 0.004, + "num_tokens": 14354152.0, + "reward": 1.446428656578064, + "reward_std": 0.5529538989067078, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.3740878105163574, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.3178380372624977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.0830680918879807, + "learning_rate": 1.9790399169455033e-05, + "loss": 0.0033, + "num_tokens": 14361228.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 454.625, + "completions/mean_terminated_length": 454.625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.3180225050728648, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.042373246513307095, + "learning_rate": 1.9789742883207532e-05, + "loss": 0.0017, + "num_tokens": 14370761.0, + "reward": 1.3318965435028076, + "reward_std": 0.13410648703575134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, + "rewards/fixed_code_pass_all_test_reward/std": 0.13410645723342896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.31820697288323185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.07158723333850503, + "learning_rate": 1.978908558202459e-05, + "loss": 0.0029, + "num_tokens": 14378992.0, + "reward": 1.688596487045288, + "reward_std": 0.30538409948349, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6885964870452881, + "rewards/fixed_code_pass_all_test_reward/std": 0.3053841292858124, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 169.625, + "completions/mean_terminated_length": 169.625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.318391440693599, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.10440605506300926, + "learning_rate": 1.9788427265974355e-05, + "loss": 0.0042, + "num_tokens": 14385741.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 219.5, + "completions/mean_terminated_length": 219.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.31857590850396605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.12287145247682929, + "learning_rate": 1.9787767935125072e-05, + "loss": 0.0049, + "num_tokens": 14396905.0, + "reward": 1.839962124824524, + "reward_std": 0.19996216893196106, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8399621248245239, + "rewards/fixed_code_pass_all_test_reward/std": 0.19996218383312225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 249.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3187603763143331, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.06132039101794362, + "learning_rate": 1.97871075895451e-05, + "loss": 0.0025, + "num_tokens": 14403200.0, + "reward": 1.8690476417541504, + "reward_std": 0.2935435473918915, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8690475821495056, + "rewards/fixed_code_pass_all_test_reward/std": 0.2935435175895691, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 250.125, + "completions/mean_terminated_length": 250.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.31894484412470026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.09117587888613343, + "learning_rate": 1.97864462293029e-05, + "loss": 0.0036, + "num_tokens": 14411833.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 495.25, + "completions/mean_terminated_length": 495.25, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.31912931193506733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3203125, + "kl": 0.06196676520630717, + "learning_rate": 1.9785783854467037e-05, + "loss": 0.0025, + "num_tokens": 14428219.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 367.875, + "completions/mean_terminated_length": 367.875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.3193137797454344, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.04813338629901409, + "learning_rate": 1.978512046510618e-05, + "loss": 0.0019, + "num_tokens": 14436666.0, + "reward": 1.21875, + "reward_std": 0.4712729752063751, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 235.25, + "completions/mean_terminated_length": 235.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.31949824755580153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.40234375, + "kl": 0.1605838891118765, + "learning_rate": 1.9784456061289105e-05, + "loss": 0.0064, + "num_tokens": 14445188.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 156.25, + "completions/mean_terminated_length": 156.25, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.3196827153661686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.10109540540724993, + "learning_rate": 1.9783790643084694e-05, + "loss": 0.004, + "num_tokens": 14449262.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 594.75, + "completions/mean_terminated_length": 594.75, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.3198671831765357, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.609375, + "kl": 0.04369717533700168, + "learning_rate": 1.978312421056193e-05, + "loss": 0.0017, + "num_tokens": 14467316.0, + "reward": 1.8645833730697632, + "reward_std": 0.3505593538284302, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.35055938363075256, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 194.25, + "completions/mean_terminated_length": 194.25, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.3200516509869028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.08497690130025148, + "learning_rate": 1.978245676378991e-05, + "loss": 0.0034, + "num_tokens": 14473918.0, + "reward": 1.8333333730697632, + "reward_std": 0.35634833574295044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3202361187972699, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.08126162411645055, + "learning_rate": 1.9781788302837824e-05, + "loss": 0.0032, + "num_tokens": 14478598.0, + "reward": 1.9791666269302368, + "reward_std": 0.058925606310367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.32042058660763695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.08717028517276049, + "learning_rate": 1.9781118827774978e-05, + "loss": 0.0035, + "num_tokens": 14489008.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 283.875, + "completions/mean_terminated_length": 283.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.3206050544180041, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.04792699869722128, + "learning_rate": 1.9780448338670775e-05, + "loss": 0.0019, + "num_tokens": 14495631.0, + "reward": 1.7678570747375488, + "reward_std": 0.43153735995292664, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.43153735995292664, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 342.625, + "completions/mean_terminated_length": 342.625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.32078952222837115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.05925857205875218, + "learning_rate": 1.9779776835594734e-05, + "loss": 0.0024, + "num_tokens": 14503556.0, + "reward": 1.6958333253860474, + "reward_std": 0.4314749240875244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6958333253860474, + "rewards/fixed_code_pass_all_test_reward/std": 0.4314749836921692, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 186.0, + "completions/mean_terminated_length": 186.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3209739900387382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.12323627155274153, + "learning_rate": 1.9779104318616464e-05, + "loss": 0.0049, + "num_tokens": 14508860.0, + "reward": 1.5541666746139526, + "reward_std": 0.33990541100502014, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5541666746139526, + "rewards/fixed_code_pass_all_test_reward/std": 0.33990544080734253, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 238.5, + "completions/mean_terminated_length": 238.5, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.32115845784910535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.0983538986183703, + "learning_rate": 1.9778430787805692e-05, + "loss": 0.0039, + "num_tokens": 14516800.0, + "reward": 1.9812500476837158, + "reward_std": 0.0530330166220665, + "rewards/fixed_code_pass_all_test_reward/mean": 0.981249988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.053033001720905304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.3213429256594724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1865234375, + "kl": 0.07698617875576019, + "learning_rate": 1.977775624323224e-05, + "loss": 0.0031, + "num_tokens": 14521807.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 191.0, + "completions/mean_terminated_length": 191.0, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.3215273934698395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03197456058114767, + "learning_rate": 1.977708068496605e-05, + "loss": 0.0013, + "num_tokens": 14526871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.3217118612802066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.06734870001673698, + "learning_rate": 1.977640411307715e-05, + "loss": 0.0027, + "num_tokens": 14532160.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 435.625, + "completions/mean_terminated_length": 205.2857208251953, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3218963290905737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.08839738339884207, + "learning_rate": 1.9775726527635687e-05, + "loss": 0.0035, + "num_tokens": 14539581.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 161.75, + "completions/mean_terminated_length": 161.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.32208079690094077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.06652000150643289, + "learning_rate": 1.9775047928711906e-05, + "loss": 0.0027, + "num_tokens": 14543747.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.3222652647113079, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.10362414317205548, + "learning_rate": 1.977436831637616e-05, + "loss": 0.0041, + "num_tokens": 14552860.0, + "reward": 1.0762712955474854, + "reward_std": 0.04707559198141098, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0762711837887764, + "rewards/fixed_code_pass_all_test_reward/std": 0.04707559570670128, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.32244973252167497, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.65625, + "kl": 0.07780564576387405, + "learning_rate": 1.977368769069891e-05, + "loss": 0.0031, + "num_tokens": 14557097.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 143.625, + "completions/mean_terminated_length": 143.625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.32263420033204204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.09352086530998349, + "learning_rate": 1.9773006051750716e-05, + "loss": 0.0037, + "num_tokens": 14561118.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 342.125, + "completions/mean_terminated_length": 342.125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.32281866814240917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.10218691546469927, + "learning_rate": 1.9772323399602243e-05, + "loss": 0.0041, + "num_tokens": 14568567.0, + "reward": 1.7010868787765503, + "reward_std": 0.2475235015153885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7010869979858398, + "rewards/fixed_code_pass_all_test_reward/std": 0.24752351641654968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 258.5, + "completions/mean_terminated_length": 258.5, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.32300313595277624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.07289698161184788, + "learning_rate": 1.9771639734324272e-05, + "loss": 0.0029, + "num_tokens": 14574947.0, + "reward": 1.4479166269302368, + "reward_std": 0.23543904721736908, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4479166567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.23543907701969147, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3231876037631433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.057445792481303215, + "learning_rate": 1.9770955055987673e-05, + "loss": 0.0023, + "num_tokens": 14584423.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.32337207157351044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.07450989726930857, + "learning_rate": 1.9770269364663433e-05, + "loss": 0.003, + "num_tokens": 14596981.0, + "reward": 1.034482717514038, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.03448275849223137, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 206.875, + "completions/mean_terminated_length": 206.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.3235565393838775, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.11056117340922356, + "learning_rate": 1.9769582660422636e-05, + "loss": 0.0044, + "num_tokens": 14604300.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 144.125, + "completions/mean_terminated_length": 144.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3237410071942446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1923828125, + "kl": 0.14394644927233458, + "learning_rate": 1.976889494333648e-05, + "loss": 0.0058, + "num_tokens": 14611061.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 302.5, + "completions/mean_terminated_length": 302.5, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.3239254750046117, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.06253304937854409, + "learning_rate": 1.9768206213476258e-05, + "loss": 0.0025, + "num_tokens": 14617337.0, + "reward": 1.53125, + "reward_std": 0.6838376522064209, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.39387044310569763, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 252.0, + "completions/mean_terminated_length": 252.0, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3241099428149788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.09701111260801554, + "learning_rate": 1.976751647091338e-05, + "loss": 0.0039, + "num_tokens": 14628457.0, + "reward": 1.7740384340286255, + "reward_std": 0.2652272582054138, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7740384340286255, + "rewards/fixed_code_pass_all_test_reward/std": 0.2652272880077362, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.32429441062534586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04948270693421364, + "learning_rate": 1.9766825715719347e-05, + "loss": 0.002, + "num_tokens": 14636495.0, + "reward": 1.5961538553237915, + "reward_std": 0.17804235219955444, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5961538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.17804233729839325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 182.75, + "completions/mean_terminated_length": 182.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.324478878435713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.06723115220665932, + "learning_rate": 1.9766133947965773e-05, + "loss": 0.0027, + "num_tokens": 14641021.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.32466334624608006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.0807058997452259, + "learning_rate": 1.9765441167724376e-05, + "loss": 0.0032, + "num_tokens": 14647569.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 176.75, + "completions/mean_terminated_length": 176.75, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.32484781405644714, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.15625, + "kl": 0.13152326829731464, + "learning_rate": 1.9764747375066984e-05, + "loss": 0.0053, + "num_tokens": 14654335.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 170.875, + "completions/mean_terminated_length": 170.875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.32503228186681427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.10068637505173683, + "learning_rate": 1.9764052570065518e-05, + "loss": 0.004, + "num_tokens": 14658550.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 285.0, + "completions/mean_terminated_length": 285.0, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.32521674967718134, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.06980323418974876, + "learning_rate": 1.9763356752792015e-05, + "loss": 0.0028, + "num_tokens": 14665270.0, + "reward": 1.6682692766189575, + "reward_std": 0.21059979498386383, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6682692170143127, + "rewards/fixed_code_pass_all_test_reward/std": 0.21059982478618622, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 246.125, + "completions/mean_terminated_length": 246.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.3254012174875484, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2177734375, + "kl": 0.09653685847297311, + "learning_rate": 1.9762659923318612e-05, + "loss": 0.0039, + "num_tokens": 14674503.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 316.375, + "completions/mean_terminated_length": 316.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.32558568529791554, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.08440969977527857, + "learning_rate": 1.976196208171755e-05, + "loss": 0.0034, + "num_tokens": 14685618.0, + "reward": 1.575657844543457, + "reward_std": 0.4741930067539215, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7006579041481018, + "rewards/fixed_code_pass_all_test_reward/std": 0.43056395649909973, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3257701531082826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.11532928328961134, + "learning_rate": 1.9761263228061177e-05, + "loss": 0.0046, + "num_tokens": 14695811.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 250.875, + "completions/mean_terminated_length": 250.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.3259546209186497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.08350975532084703, + "learning_rate": 1.9760563362421946e-05, + "loss": 0.0033, + "num_tokens": 14703234.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.3261390887290168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.07726488402113318, + "learning_rate": 1.9759862484872416e-05, + "loss": 0.0031, + "num_tokens": 14715601.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 302.375, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.3263235565393839, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.07589742448180914, + "learning_rate": 1.9759160595485246e-05, + "loss": 0.003, + "num_tokens": 14722644.0, + "reward": 1.5416666269302368, + "reward_std": 0.494011789560318, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.4940117597579956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 358.375, + "completions/mean_terminated_length": 358.375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.32650802434975096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.06759870890527964, + "learning_rate": 1.9758457694333205e-05, + "loss": 0.0027, + "num_tokens": 14733911.0, + "reward": 1.08695650100708, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08695652335882187, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3266924921601181, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.07559532998129725, + "learning_rate": 1.975775378148917e-05, + "loss": 0.003, + "num_tokens": 14743070.0, + "reward": 1.4241071939468384, + "reward_std": 0.12725728750228882, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4241071343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.1272573173046112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.32687695997048516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.06814621575176716, + "learning_rate": 1.9757048857026105e-05, + "loss": 0.0027, + "num_tokens": 14748457.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.32706142778085223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.0432502212934196, + "learning_rate": 1.9756342921017105e-05, + "loss": 0.0017, + "num_tokens": 14752979.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 280.625, + "completions/mean_terminated_length": 280.625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.3272458955912193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.036549957701936364, + "learning_rate": 1.975563597353535e-05, + "loss": 0.0015, + "num_tokens": 14758408.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 501.25, + "completions/mean_terminated_length": 501.25, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.32743036340158643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.61328125, + "kl": 0.11061828769743443, + "learning_rate": 1.9754928014654134e-05, + "loss": 0.0044, + "num_tokens": 14768354.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.3276148312119535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.10955561697483063, + "learning_rate": 1.9754219044446855e-05, + "loss": 0.0044, + "num_tokens": 14776715.0, + "reward": 1.9895832538604736, + "reward_std": 0.01928795501589775, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.019287927076220512, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3277992990223206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.06415437115356326, + "learning_rate": 1.9753509062987008e-05, + "loss": 0.0026, + "num_tokens": 14783779.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 429.625, + "completions/mean_terminated_length": 429.625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.3279837668326877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.06184139382094145, + "learning_rate": 1.9752798070348203e-05, + "loss": 0.0025, + "num_tokens": 14794976.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 361.875, + "completions/mean_terminated_length": 361.875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.3281682346430548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.04192975303158164, + "learning_rate": 1.9752086066604157e-05, + "loss": 0.0017, + "num_tokens": 14802799.0, + "reward": 1.1818182468414307, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.32835270245342185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.03831974393688142, + "learning_rate": 1.9751373051828674e-05, + "loss": 0.0015, + "num_tokens": 14808329.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 433.375, + "completions/mean_terminated_length": 433.375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.328537170263789, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.07447442412376404, + "learning_rate": 1.9750659026095684e-05, + "loss": 0.003, + "num_tokens": 14817148.0, + "reward": 1.875, + "reward_std": 0.13363061845302582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.32872163807415605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.07274224190041423, + "learning_rate": 1.9749943989479206e-05, + "loss": 0.0029, + "num_tokens": 14824733.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 111.375, + "completions/mean_terminated_length": 111.375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3289061058845231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.220703125, + "kl": 0.14287777710705996, + "learning_rate": 1.974922794205338e-05, + "loss": 0.0057, + "num_tokens": 14830736.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 434.5, + "completions/mean_terminated_length": 434.5, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.32909057369489025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.07484631799161434, + "learning_rate": 1.974851088389243e-05, + "loss": 0.003, + "num_tokens": 14838612.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 368.125, + "completions/mean_terminated_length": 368.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.3292750415052573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.06566274212673306, + "learning_rate": 1.97477928150707e-05, + "loss": 0.0026, + "num_tokens": 14849221.0, + "reward": 1.845070481300354, + "reward_std": 0.13039718568325043, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8450704216957092, + "rewards/fixed_code_pass_all_test_reward/std": 0.13039720058441162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3294595093156244, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18359375, + "kl": 0.10184938367456198, + "learning_rate": 1.9747073735662635e-05, + "loss": 0.0041, + "num_tokens": 14858932.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.3296439771259915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.09796668263152242, + "learning_rate": 1.9746353645742787e-05, + "loss": 0.0039, + "num_tokens": 14865335.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 374.625, + "completions/mean_terminated_length": 374.625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.3298284449363586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.04449487570673227, + "learning_rate": 1.9745632545385806e-05, + "loss": 0.0018, + "num_tokens": 14873028.0, + "reward": 1.45652174949646, + "reward_std": 0.7267876863479614, + "rewards/fixed_code_pass_all_test_reward/mean": 0.58152174949646, + "rewards/fixed_code_pass_all_test_reward/std": 0.4868996739387512, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 720.625, + "completions/mean_terminated_length": 720.625, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.3300129127467257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5625, + "kl": 0.03969099558889866, + "learning_rate": 1.9744910434666448e-05, + "loss": 0.0016, + "num_tokens": 14886681.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 284.375, + "completions/mean_terminated_length": 284.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.3301973805570928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0898564518429339, + "learning_rate": 1.9744187313659584e-05, + "loss": 0.0036, + "num_tokens": 14895284.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 768.875, + "completions/mean_terminated_length": 586.1428833007812, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.3303818483674599, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.02595028094947338, + "learning_rate": 1.974346318244018e-05, + "loss": 0.001, + "num_tokens": 14909691.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 239.5, + "completions/mean_terminated_length": 239.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.33056631617782695, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.046057891100645065, + "learning_rate": 1.9742738041083308e-05, + "loss": 0.0018, + "num_tokens": 14914487.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 392.375, + "completions/mean_terminated_length": 392.375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.3307507839881941, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.07713411701843143, + "learning_rate": 1.9742011889664144e-05, + "loss": 0.0031, + "num_tokens": 14924410.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 367.75, + "completions/mean_terminated_length": 367.75, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.33093525179856115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.04441390396095812, + "learning_rate": 1.9741284728257976e-05, + "loss": 0.0018, + "num_tokens": 14934248.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 586.875, + "completions/mean_terminated_length": 378.14288330078125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.3311197196089282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7265625, + "kl": 0.058024664875119925, + "learning_rate": 1.9740556556940187e-05, + "loss": 0.0023, + "num_tokens": 14943599.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.33130418741929535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.06734179286286235, + "learning_rate": 1.9739827375786273e-05, + "loss": 0.0027, + "num_tokens": 14952592.0, + "reward": 1.375, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 289.625, + "completions/mean_terminated_length": 289.625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.3314886552296624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.05632603308185935, + "learning_rate": 1.9739097184871823e-05, + "loss": 0.0023, + "num_tokens": 14958765.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 500.125, + "completions/mean_terminated_length": 500.125, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.3316731230400295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.03952726791612804, + "learning_rate": 1.9738365984272544e-05, + "loss": 0.0016, + "num_tokens": 14968414.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 398.25, + "completions/mean_terminated_length": 398.25, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.3318575908503966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.05610333103686571, + "learning_rate": 1.973763377406424e-05, + "loss": 0.0022, + "num_tokens": 14978496.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 485.875, + "completions/mean_terminated_length": 485.875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.3320420586607637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671875, + "kl": 0.025123853236436844, + "learning_rate": 1.9736900554322824e-05, + "loss": 0.001, + "num_tokens": 14992063.0, + "reward": 1.128151297569275, + "reward_std": 0.23729005455970764, + "rewards/fixed_code_pass_all_test_reward/mean": 0.12815126776695251, + "rewards/fixed_code_pass_all_test_reward/std": 0.23729003965854645, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 585.0, + "completions/mean_terminated_length": 585.0, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.33222652647113077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.039046943886205554, + "learning_rate": 1.973616632512431e-05, + "loss": 0.0016, + "num_tokens": 15002527.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 399.0, + "completions/mean_terminated_length": 399.0, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.3324109942814979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2119140625, + "kl": 0.07029687252361327, + "learning_rate": 1.9735431086544818e-05, + "loss": 0.0028, + "num_tokens": 15012271.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 342.5, + "completions/mean_terminated_length": 342.5, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.33259546209186497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.10851633409038186, + "learning_rate": 1.973469483866057e-05, + "loss": 0.0043, + "num_tokens": 15022867.0, + "reward": 1.3935811519622803, + "reward_std": 0.1874489039182663, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3935810923576355, + "rewards/fixed_code_pass_all_test_reward/std": 0.1874489188194275, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 321.625, + "completions/mean_terminated_length": 321.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.33277992990223204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.04353495198301971, + "learning_rate": 1.9733957581547905e-05, + "loss": 0.0017, + "num_tokens": 15028240.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 333.75, + "completions/mean_terminated_length": 333.75, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.33296439771259917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.03335838089697063, + "learning_rate": 1.9733219315283245e-05, + "loss": 0.0013, + "num_tokens": 15034678.0, + "reward": 1.921875, + "reward_std": 0.22097086906433105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 142.625, + "completions/mean_terminated_length": 142.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.33314886552296624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.10374193405732512, + "learning_rate": 1.9732480039943133e-05, + "loss": 0.0041, + "num_tokens": 15038603.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 253.875, + "completions/mean_terminated_length": 253.875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.08150022150948644, + "learning_rate": 1.9731739755604217e-05, + "loss": 0.0033, + "num_tokens": 15043818.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 483.125, + "completions/mean_terminated_length": 483.125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.33351780114370044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.04728736914694309, + "learning_rate": 1.9730998462343237e-05, + "loss": 0.0019, + "num_tokens": 15054699.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 322.875, + "completions/mean_terminated_length": 322.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.3337022689540675, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.04933475377038121, + "learning_rate": 1.9730256160237047e-05, + "loss": 0.002, + "num_tokens": 15060906.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 342.25, + "completions/mean_terminated_length": 342.25, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3338867367644346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.05992202879860997, + "learning_rate": 1.9729512849362607e-05, + "loss": 0.0024, + "num_tokens": 15070060.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 384.375, + "completions/mean_terminated_length": 384.375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.3340712045748017, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.05250877724029124, + "learning_rate": 1.9728768529796976e-05, + "loss": 0.0021, + "num_tokens": 15077599.0, + "reward": 1.65816330909729, + "reward_std": 0.21973492205142975, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6581632494926453, + "rewards/fixed_code_pass_all_test_reward/std": 0.21973496675491333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 248.625, + "completions/mean_terminated_length": 248.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.3342556723851688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.09539045346900821, + "learning_rate": 1.9728023201617326e-05, + "loss": 0.0038, + "num_tokens": 15086804.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 433.0, + "completions/mean_terminated_length": 433.0, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.33444014019553586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.06113663408905268, + "learning_rate": 1.972727686490092e-05, + "loss": 0.0024, + "num_tokens": 15102708.0, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, + "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 382.875, + "completions/mean_terminated_length": 382.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.334624608005903, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04856591857969761, + "learning_rate": 1.9726529519725136e-05, + "loss": 0.0019, + "num_tokens": 15113051.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 356.375, + "completions/mean_terminated_length": 356.375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.33480907581627006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.03870244463905692, + "learning_rate": 1.9725781166167452e-05, + "loss": 0.0015, + "num_tokens": 15119550.0, + "reward": 1.9500000476837158, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 309.5, + "completions/mean_terminated_length": 309.5, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.33499354362663714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.04484340222552419, + "learning_rate": 1.9725031804305454e-05, + "loss": 0.0018, + "num_tokens": 15128954.0, + "reward": 1.7799999713897705, + "reward_std": 0.40736088156700134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7799999713897705, + "rewards/fixed_code_pass_all_test_reward/std": 0.40736085176467896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 146.625, + "completions/mean_terminated_length": 146.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.33517801143700426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.06208996404893696, + "learning_rate": 1.9724281434216836e-05, + "loss": 0.0025, + "num_tokens": 15132991.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 535.5, + "completions/mean_terminated_length": 535.5, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.33536247924737134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.609375, + "kl": 0.029938201420009136, + "learning_rate": 1.972353005597938e-05, + "loss": 0.0012, + "num_tokens": 15143627.0, + "reward": 1.3928571939468384, + "reward_std": 0.2753211557865143, + "rewards/fixed_code_pass_all_test_reward/mean": 0.392857164144516, + "rewards/fixed_code_pass_all_test_reward/std": 0.27532118558883667, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 350.375, + "completions/mean_terminated_length": 350.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.3355469470577384, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.06490082759410143, + "learning_rate": 1.9722777669670995e-05, + "loss": 0.0026, + "num_tokens": 15153150.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 261.375, + "completions/mean_terminated_length": 261.375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.33573141486810554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.10711404168978333, + "learning_rate": 1.9722024275369677e-05, + "loss": 0.0043, + "num_tokens": 15158097.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 450.25, + "completions/mean_terminated_length": 450.25, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.3359158826784726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.0272505545290187, + "learning_rate": 1.9721269873153535e-05, + "loss": 0.0011, + "num_tokens": 15166827.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 255.25, + "completions/mean_terminated_length": 255.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3361003504888397, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.05219823378138244, + "learning_rate": 1.9720514463100783e-05, + "loss": 0.0021, + "num_tokens": 15178613.0, + "reward": 1.495192289352417, + "reward_std": 0.20397312939167023, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4951923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.20397312939167023, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 211.25, + "completions/mean_terminated_length": 211.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.3362848182992068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.06762572703883052, + "learning_rate": 1.971975804528973e-05, + "loss": 0.0027, + "num_tokens": 15183159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3364692861095739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.07368692522868514, + "learning_rate": 1.9719000619798804e-05, + "loss": 0.0029, + "num_tokens": 15187852.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.33665375391994096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.07078687148168683, + "learning_rate": 1.971824218670652e-05, + "loss": 0.0028, + "num_tokens": 15197614.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 417.375, + "completions/mean_terminated_length": 417.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.3368382217303081, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.07483380509074777, + "learning_rate": 1.971748274609152e-05, + "loss": 0.003, + "num_tokens": 15207105.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 249.5, + "completions/mean_terminated_length": 249.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.33702268954067516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1572265625, + "kl": 0.08799533522687852, + "learning_rate": 1.9716722298032528e-05, + "loss": 0.0035, + "num_tokens": 15212757.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.33720715735104223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.271484375, + "kl": 0.07945042056962848, + "learning_rate": 1.9715960842608385e-05, + "loss": 0.0032, + "num_tokens": 15216615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 442.125, + "completions/mean_terminated_length": 442.125, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.33739162516140936, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.01196912198793143, + "learning_rate": 1.9715198379898036e-05, + "loss": 0.0005, + "num_tokens": 15225192.0, + "reward": 1.1500000953674316, + "reward_std": 0.053452279418706894, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, + "rewards/fixed_code_pass_all_test_reward/std": 0.053452249616384506, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 190.75, + "completions/mean_terminated_length": 190.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.33757609297177643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.03773574670776725, + "learning_rate": 1.9714434909980524e-05, + "loss": 0.0015, + "num_tokens": 15229638.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 457.375, + "completions/mean_terminated_length": 457.375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.3377605607821435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.02060489682480693, + "learning_rate": 1.9713670432935008e-05, + "loss": 0.0008, + "num_tokens": 15238233.0, + "reward": 1.875, + "reward_std": 0.2082483023405075, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2082482874393463, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 369.375, + "completions/mean_terminated_length": 369.375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.33794502859251063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.05617201211862266, + "learning_rate": 1.971290494884073e-05, + "loss": 0.0022, + "num_tokens": 15248348.0, + "reward": 1.7750000953674316, + "reward_std": 0.310529500246048, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.3381294964028777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.06647268310189247, + "learning_rate": 1.9712138457777067e-05, + "loss": 0.0027, + "num_tokens": 15257724.0, + "reward": 1.5592105388641357, + "reward_std": 0.38669759035110474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5592105388641357, + "rewards/fixed_code_pass_all_test_reward/std": 0.38669759035110474, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3383139642132448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.021635653014527634, + "learning_rate": 1.9711370959823472e-05, + "loss": 0.0009, + "num_tokens": 15264471.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 317.625, + "completions/mean_terminated_length": 317.625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.3384984320236119, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.06669621355831623, + "learning_rate": 1.9710602455059516e-05, + "loss": 0.0027, + "num_tokens": 15275620.0, + "reward": 1.2727272510528564, + "reward_std": 0.3331364691257477, + "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, + "rewards/fixed_code_pass_all_test_reward/std": 0.33313652873039246, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 489.375, + "completions/mean_terminated_length": 266.71429443359375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.338682899833979, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.53125, + "kl": 0.07141411933116615, + "learning_rate": 1.9709832943564874e-05, + "loss": 0.0029, + "num_tokens": 15287327.0, + "reward": 1.5499999523162842, + "reward_std": 0.6406718492507935, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.3043079078197479, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 333.5, + "completions/mean_terminated_length": 333.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.33886736764434605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.036501258378848433, + "learning_rate": 1.9709062425419326e-05, + "loss": 0.0015, + "num_tokens": 15293435.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 579.25, + "completions/mean_terminated_length": 579.25, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.3390518354547132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.04275131202302873, + "learning_rate": 1.9708290900702752e-05, + "loss": 0.0017, + "num_tokens": 15307453.0, + "reward": 1.4711538553237915, + "reward_std": 0.467791885137558, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4711538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.46779191493988037, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.33923630326508025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.08385665202513337, + "learning_rate": 1.9707518369495138e-05, + "loss": 0.0034, + "num_tokens": 15315083.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 498.625, + "completions/mean_terminated_length": 498.625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.3394207710754473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.05743088014423847, + "learning_rate": 1.9706744831876576e-05, + "loss": 0.0023, + "num_tokens": 15323984.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 278.125, + "completions/mean_terminated_length": 278.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.3396052388858144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.0571156470105052, + "learning_rate": 1.970597028792726e-05, + "loss": 0.0023, + "num_tokens": 15330033.0, + "reward": 1.6101974248886108, + "reward_std": 0.22450506687164307, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6101974248886108, + "rewards/fixed_code_pass_all_test_reward/std": 0.22450508177280426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 710.5, + "completions/mean_terminated_length": 710.5, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "epoch": 0.3397897066961815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71875, + "kl": 0.03120402479544282, + "learning_rate": 1.9705194737727492e-05, + "loss": 0.0012, + "num_tokens": 15342869.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.3399741745065486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30078125, + "kl": 0.08945373119786382, + "learning_rate": 1.9704418181357675e-05, + "loss": 0.0036, + "num_tokens": 15347114.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 220.375, + "completions/mean_terminated_length": 220.375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.34015864231691567, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.05138180451467633, + "learning_rate": 1.9703640618898313e-05, + "loss": 0.0021, + "num_tokens": 15351717.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.3403431101272828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04971617856062949, + "learning_rate": 1.9702862050430024e-05, + "loss": 0.002, + "num_tokens": 15359792.0, + "reward": 1.21875, + "reward_std": 0.23385359346866608, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21875, + "rewards/fixed_code_pass_all_test_reward/std": 0.23385359346866608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3405275779376499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.07072699163109064, + "learning_rate": 1.9702082476033522e-05, + "loss": 0.0028, + "num_tokens": 15367709.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 242.625, + "completions/mean_terminated_length": 242.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.34071204574801695, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.03315013169776648, + "learning_rate": 1.9701301895789627e-05, + "loss": 0.0013, + "num_tokens": 15373210.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 377.75, + "completions/mean_terminated_length": 377.75, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.3408965135583841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.62890625, + "kl": 0.0798835544846952, + "learning_rate": 1.9700520309779268e-05, + "loss": 0.0032, + "num_tokens": 15381600.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 238.875, + "completions/mean_terminated_length": 238.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.34108098136875115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.06389265088364482, + "learning_rate": 1.9699737718083475e-05, + "loss": 0.0026, + "num_tokens": 15390767.0, + "reward": 1.7190594673156738, + "reward_std": 0.40474647283554077, + "rewards/fixed_code_pass_all_test_reward/mean": 0.719059407711029, + "rewards/fixed_code_pass_all_test_reward/std": 0.40474650263786316, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.3412654491791182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.03468956728465855, + "learning_rate": 1.9698954120783378e-05, + "loss": 0.0014, + "num_tokens": 15394609.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 721.125, + "completions/mean_terminated_length": 721.125, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "epoch": 0.34144991698948535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.040311090648174286, + "learning_rate": 1.9698169517960216e-05, + "loss": 0.0016, + "num_tokens": 15412226.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 260.75, + "completions/mean_terminated_length": 260.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.3416343847998524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.061570935882627964, + "learning_rate": 1.9697383909695332e-05, + "loss": 0.0025, + "num_tokens": 15420096.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 228.5, + "completions/mean_terminated_length": 228.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3418188526102195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.03358614759054035, + "learning_rate": 1.9696597296070177e-05, + "loss": 0.0013, + "num_tokens": 15425860.0, + "reward": 1.984375, + "reward_std": 0.04419417306780815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 246.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.3420033204205866, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.06291339313611388, + "learning_rate": 1.9695809677166294e-05, + "loss": 0.0025, + "num_tokens": 15436938.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 410.25, + "completions/mean_terminated_length": 410.25, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3421877882309537, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.02087060033227317, + "learning_rate": 1.969502105306534e-05, + "loss": 0.0008, + "num_tokens": 15445556.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.34237225604132077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.047828986775130033, + "learning_rate": 1.9694231423849083e-05, + "loss": 0.0019, + "num_tokens": 15453219.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.3425567238516879, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.07386017125099897, + "learning_rate": 1.9693440789599373e-05, + "loss": 0.003, + "num_tokens": 15459350.0, + "reward": 1.5586735010147095, + "reward_std": 0.35565075278282166, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5586735010147095, + "rewards/fixed_code_pass_all_test_reward/std": 0.35565078258514404, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.34274119166205497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.07171640545129776, + "learning_rate": 1.969264915039819e-05, + "loss": 0.0029, + "num_tokens": 15466491.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 292.875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.34292565947242204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.05277696577832103, + "learning_rate": 1.9691856506327595e-05, + "loss": 0.0021, + "num_tokens": 15475882.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 168.125, + "completions/mean_terminated_length": 168.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.34311012728278917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.07378734485246241, + "learning_rate": 1.9691062857469773e-05, + "loss": 0.003, + "num_tokens": 15480139.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 264.5, + "completions/mean_terminated_length": 264.5, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.34329459509315624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.06306059774942696, + "learning_rate": 1.9690268203907e-05, + "loss": 0.0025, + "num_tokens": 15486311.0, + "reward": 1.625, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 292.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.3434790629035233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.05596954328939319, + "learning_rate": 1.968947254572166e-05, + "loss": 0.0022, + "num_tokens": 15491430.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 180.75, + "completions/mean_terminated_length": 180.75, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.34366353071389044, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.08689720742404461, + "learning_rate": 1.9688675882996243e-05, + "loss": 0.0035, + "num_tokens": 15495660.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1436.0, + "completions/max_terminated_length": 1436.0, + "completions/mean_length": 545.875, + "completions/mean_terminated_length": 545.875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.3438479985242575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.251953125, + "kl": 0.03413332987111062, + "learning_rate": 1.968787821581334e-05, + "loss": 0.0014, + "num_tokens": 15503987.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3440324663346246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.05155676091089845, + "learning_rate": 1.9687079544255653e-05, + "loss": 0.0021, + "num_tokens": 15508773.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 342.25, + "completions/mean_terminated_length": 342.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.3442169341449917, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.05855845287442207, + "learning_rate": 1.9686279868405975e-05, + "loss": 0.0023, + "num_tokens": 15515951.0, + "reward": 1.3691861629486084, + "reward_std": 0.2548873722553253, + "rewards/fixed_code_pass_all_test_reward/mean": 0.36918604373931885, + "rewards/fixed_code_pass_all_test_reward/std": 0.25488734245300293, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 292.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3444014019553588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5546875, + "kl": 0.09417074674274772, + "learning_rate": 1.9685479188347214e-05, + "loss": 0.0038, + "num_tokens": 15524502.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 286.0, + "completions/mean_terminated_length": 286.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.34458586976572586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.058050604071468115, + "learning_rate": 1.9684677504162384e-05, + "loss": 0.0023, + "num_tokens": 15534222.0, + "reward": 1.8607594966888428, + "reward_std": 0.19316022098064423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8607594966888428, + "rewards/fixed_code_pass_all_test_reward/std": 0.19316023588180542, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 497.5, + "completions/mean_terminated_length": 497.5, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.344770337576093, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76171875, + "kl": 0.04016149416565895, + "learning_rate": 1.9683874815934595e-05, + "loss": 0.0016, + "num_tokens": 15544026.0, + "reward": 1.2300000190734863, + "reward_std": 0.26251524686813354, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23000000417232513, + "rewards/fixed_code_pass_all_test_reward/std": 0.2625153064727783, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 253.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.34495480538646006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.04399755736812949, + "learning_rate": 1.968307112374706e-05, + "loss": 0.0018, + "num_tokens": 15553175.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 475.0, + "completions/mean_terminated_length": 475.0, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.34513927319682713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.0354848129209131, + "learning_rate": 1.9682266427683107e-05, + "loss": 0.0014, + "num_tokens": 15562503.0, + "reward": 1.6527777910232544, + "reward_std": 0.4840944707393646, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6527777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.4840944707393646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 307.25, + "completions/mean_terminated_length": 307.25, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.34532374100719426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.03553930693306029, + "learning_rate": 1.968146072782616e-05, + "loss": 0.0014, + "num_tokens": 15572017.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 329.625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.34550820881756134, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.048338708467781544, + "learning_rate": 1.9680654024259746e-05, + "loss": 0.0019, + "num_tokens": 15582894.0, + "reward": 1.7000000476837158, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.699999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 238.5, + "completions/mean_terminated_length": 238.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3456926766279284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.123046875, + "kl": 0.07934860605746508, + "learning_rate": 1.9679846317067502e-05, + "loss": 0.0032, + "num_tokens": 15591386.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 272.625, + "completions/mean_terminated_length": 272.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.34587714443829554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.05576801672577858, + "learning_rate": 1.967903760633316e-05, + "loss": 0.0022, + "num_tokens": 15599439.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 307.25, + "completions/mean_terminated_length": 307.25, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.3460616122486626, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.04841893323464319, + "learning_rate": 1.967822789214057e-05, + "loss": 0.0019, + "num_tokens": 15608273.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 609.5, + "completions/mean_terminated_length": 609.5, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.3462460800590297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.03979158797301352, + "learning_rate": 1.967741717457367e-05, + "loss": 0.0016, + "num_tokens": 15623437.0, + "reward": 1.2848360538482666, + "reward_std": 0.24108274281024933, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2848360538482666, + "rewards/fixed_code_pass_all_test_reward/std": 0.24108275771141052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 376.875, + "completions/mean_terminated_length": 376.875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.3464305478693968, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.02918151766061783, + "learning_rate": 1.9676605453716516e-05, + "loss": 0.0012, + "num_tokens": 15631236.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.3466150156797639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.041618830524384975, + "learning_rate": 1.9675792729653256e-05, + "loss": 0.0017, + "num_tokens": 15635515.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 328.875, + "completions/mean_terminated_length": 328.875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.34679948349013096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83984375, + "kl": 0.03911396209150553, + "learning_rate": 1.9674979002468152e-05, + "loss": 0.0016, + "num_tokens": 15645434.0, + "reward": 1.5, + "reward_std": 0.1781741827726364, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.1781741827726364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 365.625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.3469839513004981, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.0460129554849118, + "learning_rate": 1.9674164272245565e-05, + "loss": 0.0018, + "num_tokens": 15656191.0, + "reward": 1.6875, + "reward_std": 0.3260718882083893, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.32607191801071167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.34716841911086516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.057686587795615196, + "learning_rate": 1.967334853906996e-05, + "loss": 0.0023, + "num_tokens": 15665486.0, + "reward": 1.7053571939468384, + "reward_std": 0.29435279965400696, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7053571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.29435282945632935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 420.375, + "completions/mean_terminated_length": 420.375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.34735288692123223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.03906270093284547, + "learning_rate": 1.9672531803025913e-05, + "loss": 0.0016, + "num_tokens": 15674153.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 342.375, + "completions/mean_terminated_length": 342.375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.34753735473159936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.049105466809123755, + "learning_rate": 1.9671714064198086e-05, + "loss": 0.002, + "num_tokens": 15687060.0, + "reward": 1.2715516090393066, + "reward_std": 0.44976547360420227, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3965517282485962, + "rewards/fixed_code_pass_all_test_reward/std": 0.49970266222953796, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 655.75, + "completions/mean_terminated_length": 655.75, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "epoch": 0.34772182254196643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.022958481684327126, + "learning_rate": 1.9670895322671263e-05, + "loss": 0.0009, + "num_tokens": 15699162.0, + "reward": 1.1458332538604736, + "reward_std": 0.058925557881593704, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1458333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 164.625, + "completions/mean_terminated_length": 164.625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.3479062903523335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.05411185370758176, + "learning_rate": 1.9670075578530325e-05, + "loss": 0.0022, + "num_tokens": 15703223.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 353.25, + "completions/mean_terminated_length": 353.25, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.34809075816270063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.06141763227060437, + "learning_rate": 1.9669254831860262e-05, + "loss": 0.0025, + "num_tokens": 15711713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 252.25, + "completions/mean_terminated_length": 252.25, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3482752259730677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.0587551542557776, + "learning_rate": 1.9668433082746157e-05, + "loss": 0.0024, + "num_tokens": 15717699.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 480.625, + "completions/mean_terminated_length": 480.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.3484596937834348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.04872559616342187, + "learning_rate": 1.9667610331273205e-05, + "loss": 0.0019, + "num_tokens": 15730816.0, + "reward": 1.4464285373687744, + "reward_std": 0.14328168332576752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.1432816982269287, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 468.0, + "completions/mean_terminated_length": 468.0, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.3486441615938019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.03919334663078189, + "learning_rate": 1.966678657752671e-05, + "loss": 0.0016, + "num_tokens": 15743288.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.348828629404169, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.09010765142738819, + "learning_rate": 1.966596182159206e-05, + "loss": 0.0036, + "num_tokens": 15747801.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.34901309721453605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.06038056802935898, + "learning_rate": 1.9665136063554774e-05, + "loss": 0.0024, + "num_tokens": 15755520.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3491975650249032, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.08439464960247278, + "learning_rate": 1.9664309303500455e-05, + "loss": 0.0034, + "num_tokens": 15759507.0, + "reward": 0.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.5, + "rewards/format_reward/std": 0.5345224738121033, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 337.375, + "completions/mean_terminated_length": 337.375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.34938203283527025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.07662152126431465, + "learning_rate": 1.9663481541514814e-05, + "loss": 0.0031, + "num_tokens": 15770110.0, + "reward": 1.5597827434539795, + "reward_std": 0.22822393476963043, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5597826242446899, + "rewards/fixed_code_pass_all_test_reward/std": 0.22822390496730804, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 400.5, + "completions/mean_terminated_length": 400.5, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.3495665006456373, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05332147283479571, + "learning_rate": 1.966265277768367e-05, + "loss": 0.0021, + "num_tokens": 15777650.0, + "reward": 1.7857143878936768, + "reward_std": 0.24414090812206268, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.24414092302322388, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.34975096845600445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.06096297223120928, + "learning_rate": 1.9661823012092945e-05, + "loss": 0.0024, + "num_tokens": 15785546.0, + "reward": 1.8250000476837158, + "reward_std": 0.27645719051361084, + "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.27645716071128845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 264.75, + "completions/mean_terminated_length": 264.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.3499354362663715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.07233964698389173, + "learning_rate": 1.966099224482866e-05, + "loss": 0.0029, + "num_tokens": 15793456.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3501199040767386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.03652283735573292, + "learning_rate": 1.966016047597695e-05, + "loss": 0.0015, + "num_tokens": 15799346.0, + "reward": 1.7999999523162842, + "reward_std": 0.38544961810112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 626.25, + "completions/mean_terminated_length": 626.25, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "epoch": 0.3503043718871057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.038280142238363624, + "learning_rate": 1.965932770562404e-05, + "loss": 0.0015, + "num_tokens": 15810212.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 262.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.3504888396974728, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.06370309367775917, + "learning_rate": 1.965849393385627e-05, + "loss": 0.0025, + "num_tokens": 15819249.0, + "reward": 1.9558823108673096, + "reward_std": 0.12478352338075638, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9558823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.12478354573249817, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 544.5, + "completions/mean_terminated_length": 544.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.35067330750783987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.043001665151678026, + "learning_rate": 1.9657659160760078e-05, + "loss": 0.0017, + "num_tokens": 15829285.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 189.875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.350857775318207, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.07267709542065859, + "learning_rate": 1.9656823386422008e-05, + "loss": 0.0029, + "num_tokens": 15833628.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 581.0, + "completions/mean_terminated_length": 371.4285888671875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.35104224312857407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.484375, + "kl": 0.05674180522328243, + "learning_rate": 1.965598661092871e-05, + "loss": 0.0023, + "num_tokens": 15843676.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 487.5, + "completions/mean_terminated_length": 487.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.35122671093894114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.05473588057793677, + "learning_rate": 1.9655148834366934e-05, + "loss": 0.0022, + "num_tokens": 15855480.0, + "reward": 1.8181817531585693, + "reward_std": 0.2201213240623474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.22012126445770264, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 327.125, + "completions/mean_terminated_length": 327.125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.3514111787493083, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.06024289480410516, + "learning_rate": 1.9654310056823534e-05, + "loss": 0.0024, + "num_tokens": 15865017.0, + "reward": 1.6470588445663452, + "reward_std": 0.7252251505851746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7720588445663452, + "rewards/fixed_code_pass_all_test_reward/std": 0.4246920347213745, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.35159564655967535, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.03684038948267698, + "learning_rate": 1.9653470278385468e-05, + "loss": 0.0015, + "num_tokens": 15869535.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3517801143700424, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.11445109848864377, + "learning_rate": 1.96526294991398e-05, + "loss": 0.0046, + "num_tokens": 15875546.0, + "reward": 1.1875, + "reward_std": 0.4124789834022522, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255690574646, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 376.5, + "completions/mean_terminated_length": 376.5, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.3519645821804095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.06378674600273371, + "learning_rate": 1.96517877191737e-05, + "loss": 0.0026, + "num_tokens": 15884478.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 769.75, + "completions/mean_terminated_length": 769.75, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.3521490499907766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.427734375, + "kl": 0.022394536761566997, + "learning_rate": 1.9650944938574433e-05, + "loss": 0.0009, + "num_tokens": 15901252.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 410.75, + "completions/mean_terminated_length": 410.75, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.3523335178011437, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.78125, + "kl": 0.05032914981711656, + "learning_rate": 1.9650101157429377e-05, + "loss": 0.002, + "num_tokens": 15909018.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 349.875, + "completions/mean_terminated_length": 349.875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.35251798561151076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.045735320542007685, + "learning_rate": 1.9649256375826003e-05, + "loss": 0.0018, + "num_tokens": 15916193.0, + "reward": 1.8556034564971924, + "reward_std": 0.31142908334732056, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8556034564971924, + "rewards/fixed_code_pass_all_test_reward/std": 0.31142905354499817, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 438.125, + "completions/mean_terminated_length": 438.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.3527024534218779, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.06275790487416089, + "learning_rate": 1.96484105938519e-05, + "loss": 0.0025, + "num_tokens": 15928626.0, + "reward": 1.5178571939468384, + "reward_std": 0.43153735995292664, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5178571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.43153735995292664, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.35288692123224497, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.056699561420828104, + "learning_rate": 1.964756381159475e-05, + "loss": 0.0023, + "num_tokens": 15934135.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 538.375, + "completions/mean_terminated_length": 538.375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.35307138904261204, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.02800741884857416, + "learning_rate": 1.9646716029142338e-05, + "loss": 0.0011, + "num_tokens": 15945954.0, + "reward": 1.8576922416687012, + "reward_std": 0.18493562936782837, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8576923608779907, + "rewards/fixed_code_pass_all_test_reward/std": 0.18493562936782837, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 463.875, + "completions/mean_terminated_length": 463.875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.35325585685297917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.03093776188325137, + "learning_rate": 1.964586724658256e-05, + "loss": 0.0012, + "num_tokens": 15953385.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 581.125, + "completions/mean_terminated_length": 371.5714416503906, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.35344032466334624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.390625, + "kl": 0.029448895656969398, + "learning_rate": 1.9645017464003414e-05, + "loss": 0.0012, + "num_tokens": 15964418.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 611.0, + "completions/mean_terminated_length": 405.71429443359375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.3536247924737133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.578125, + "kl": 0.04182275087805465, + "learning_rate": 1.9644166681492997e-05, + "loss": 0.0017, + "num_tokens": 15975378.0, + "reward": 1.375, + "reward_std": 0.6648039817810059, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.4172614812850952, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 237.375, + "completions/mean_terminated_length": 237.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.35380926028408044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.0175650320306886, + "learning_rate": 1.9643314899139514e-05, + "loss": 0.0007, + "num_tokens": 15981093.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.3539937280944475, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05295420857146382, + "learning_rate": 1.964246211703127e-05, + "loss": 0.0021, + "num_tokens": 15990639.0, + "reward": 1.7093024253845215, + "reward_std": 0.36049190163612366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7093023061752319, + "rewards/fixed_code_pass_all_test_reward/std": 0.36049193143844604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 460.75, + "completions/mean_terminated_length": 460.75, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.3541781959048146, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.59765625, + "kl": 0.046897446271032095, + "learning_rate": 1.9641608335256678e-05, + "loss": 0.0019, + "num_tokens": 16004421.0, + "reward": 1.9147727489471436, + "reward_std": 0.24105912446975708, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.24105913937091827, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.3543626637151817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.045850432477891445, + "learning_rate": 1.964075355390425e-05, + "loss": 0.0018, + "num_tokens": 16011854.0, + "reward": 1.7569444179534912, + "reward_std": 0.20126910507678986, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7569444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.20126911997795105, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 455.5, + "completions/mean_terminated_length": 455.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.3545471315255488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.057785764336586, + "learning_rate": 1.963989777306261e-05, + "loss": 0.0023, + "num_tokens": 16023890.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 349.5, + "completions/mean_terminated_length": 349.5, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.35473159933591586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.05538790486752987, + "learning_rate": 1.963904099282047e-05, + "loss": 0.0022, + "num_tokens": 16033494.0, + "reward": 1.6624999046325684, + "reward_std": 0.1767766773700714, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6624999642372131, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 425.0, + "completions/mean_terminated_length": 425.0, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.354916067146283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.03891425649635494, + "learning_rate": 1.9638183213266665e-05, + "loss": 0.0016, + "num_tokens": 16044286.0, + "reward": 1.5283019542694092, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5283018946647644, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 108.875, + "completions/mean_terminated_length": 108.875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.35510053495665006, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.11269979272037745, + "learning_rate": 1.9637324434490116e-05, + "loss": 0.0045, + "num_tokens": 16047861.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 400.75, + "completions/mean_terminated_length": 400.75, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.35528500276701713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.058071716921404004, + "learning_rate": 1.9636464656579865e-05, + "loss": 0.0023, + "num_tokens": 16058915.0, + "reward": 1.9287974834442139, + "reward_std": 0.20139116048812866, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9287974834442139, + "rewards/fixed_code_pass_all_test_reward/std": 0.20139117538928986, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 639.25, + "completions/mean_terminated_length": 639.25, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.35546947057738426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.03500722371973097, + "learning_rate": 1.9635603879625034e-05, + "loss": 0.0014, + "num_tokens": 16069469.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 136.875, + "completions/mean_terminated_length": 136.875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.35565393838775133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.017614608630537987, + "learning_rate": 1.9634742103714877e-05, + "loss": 0.0007, + "num_tokens": 16073340.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 306.875, + "completions/mean_terminated_length": 306.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.3558384061981184, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.06307699484750628, + "learning_rate": 1.9633879328938724e-05, + "loss": 0.0025, + "num_tokens": 16084875.0, + "reward": 1.9861111640930176, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.35602287400848553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.05593383149243891, + "learning_rate": 1.9633015555386033e-05, + "loss": 0.0022, + "num_tokens": 16096109.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 440.75, + "completions/mean_terminated_length": 440.75, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.3562073418188526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.044489318039268255, + "learning_rate": 1.9632150783146353e-05, + "loss": 0.0018, + "num_tokens": 16104179.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 392.5, + "completions/mean_terminated_length": 392.5, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.3563918096292197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.016771857510320842, + "learning_rate": 1.9631285012309332e-05, + "loss": 0.0007, + "num_tokens": 16111735.0, + "reward": 1.7613637447357178, + "reward_std": 0.32934945821762085, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7613636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.32934945821762085, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3565762774395868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.07636563898995519, + "learning_rate": 1.9630418242964727e-05, + "loss": 0.0031, + "num_tokens": 16117209.0, + "reward": 1.5333333015441895, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333611488342, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 465.625, + "completions/mean_terminated_length": 465.625, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.3567607452499539, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6953125, + "kl": 0.02686226787045598, + "learning_rate": 1.9629550475202403e-05, + "loss": 0.0011, + "num_tokens": 16126486.0, + "reward": 1.6354167461395264, + "reward_std": 0.08838837593793869, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6354166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883610367775, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 344.125, + "completions/mean_terminated_length": 344.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.35694521306032095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.0657939207740128, + "learning_rate": 1.9628681709112326e-05, + "loss": 0.0026, + "num_tokens": 16135031.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 574.75, + "completions/mean_terminated_length": 574.75, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.3571296808706881, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.11114898091182113, + "learning_rate": 1.962781194478456e-05, + "loss": 0.0044, + "num_tokens": 16148621.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 228.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.35731414868105515, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1328125, + "kl": 0.11419446673244238, + "learning_rate": 1.962694118230928e-05, + "loss": 0.0046, + "num_tokens": 16153459.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 348.875, + "completions/mean_terminated_length": 348.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.3574986164914222, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06929788598790765, + "learning_rate": 1.9626069421776753e-05, + "loss": 0.0028, + "num_tokens": 16162490.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.35768308430178936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.07319977739825845, + "learning_rate": 1.9625196663277368e-05, + "loss": 0.0029, + "num_tokens": 16171397.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 258.125, + "completions/mean_terminated_length": 258.125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.35786755211215643, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26171875, + "kl": 0.038997467316221446, + "learning_rate": 1.9624322906901596e-05, + "loss": 0.0016, + "num_tokens": 16177030.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.3580520199225235, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.027502761920914054, + "learning_rate": 1.962344815274003e-05, + "loss": 0.0011, + "num_tokens": 16182124.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 293.375, + "completions/mean_terminated_length": 293.375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.35823648773289063, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.055199426133185625, + "learning_rate": 1.962257240088336e-05, + "loss": 0.0022, + "num_tokens": 16192655.0, + "reward": 1.25, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 303.5, + "completions/mean_terminated_length": 303.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3584209555432577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.04376601963303983, + "learning_rate": 1.962169565142237e-05, + "loss": 0.0018, + "num_tokens": 16199347.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 353.375, + "completions/mean_terminated_length": 353.375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.3586054233536248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.04284642159473151, + "learning_rate": 1.9620817904447964e-05, + "loss": 0.0017, + "num_tokens": 16206270.0, + "reward": 1.98828125, + "reward_std": 0.03314562886953354, + "rewards/fixed_code_pass_all_test_reward/mean": 0.98828125, + "rewards/fixed_code_pass_all_test_reward/std": 0.03314562886953354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 244.25, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.3587898911639919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.037382343667559326, + "learning_rate": 1.9619939160051136e-05, + "loss": 0.0015, + "num_tokens": 16211536.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.358974358974359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.015462528681382537, + "learning_rate": 1.9619059418322987e-05, + "loss": 0.0006, + "num_tokens": 16218195.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 355.875, + "completions/mean_terminated_length": 355.875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.35915882678472605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.022646317083854228, + "learning_rate": 1.961817867935473e-05, + "loss": 0.0009, + "num_tokens": 16224634.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 245.25, + "completions/mean_terminated_length": 245.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.3593432945950932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.04282309440895915, + "learning_rate": 1.9617296943237667e-05, + "loss": 0.0017, + "num_tokens": 16230556.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 250.5, + "completions/mean_terminated_length": 250.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.35952776240546025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.019059712416492403, + "learning_rate": 1.961641421006321e-05, + "loss": 0.0008, + "num_tokens": 16235480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.3597122302158273, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.054892236832529306, + "learning_rate": 1.9615530479922885e-05, + "loss": 0.0022, + "num_tokens": 16240385.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.35989669802619445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.0931207686662674, + "learning_rate": 1.96146457529083e-05, + "loss": 0.0037, + "num_tokens": 16248357.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 211.375, + "completions/mean_terminated_length": 211.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.3600811658365615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.06492138048633933, + "learning_rate": 1.9613760029111183e-05, + "loss": 0.0026, + "num_tokens": 16255192.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1850.0, + "completions/max_terminated_length": 1850.0, + "completions/mean_length": 684.5, + "completions/mean_terminated_length": 684.5, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.3602656336469286, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.02135612870915793, + "learning_rate": 1.9612873308623355e-05, + "loss": 0.0009, + "num_tokens": 16264196.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 303.125, + "completions/mean_terminated_length": 303.125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.3604501014572957, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.027234005508944392, + "learning_rate": 1.9611985591536755e-05, + "loss": 0.0011, + "num_tokens": 16270749.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 321.0, + "completions/mean_terminated_length": 321.0, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.3606345692676628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.04629516368731856, + "learning_rate": 1.9611096877943404e-05, + "loss": 0.0019, + "num_tokens": 16277405.0, + "reward": 1.6938775777816772, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6938775777816772, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 436.625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.36081903707802987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.05965818534605205, + "learning_rate": 1.9610207167935446e-05, + "loss": 0.0024, + "num_tokens": 16287906.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 369.5, + "completions/mean_terminated_length": 369.5, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.361003504888397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.04203966353088617, + "learning_rate": 1.960931646160512e-05, + "loss": 0.0017, + "num_tokens": 16298966.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 430.125, + "completions/mean_terminated_length": 430.125, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.36118797269876407, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.09375, + "kl": 0.10930985654704273, + "learning_rate": 1.9608424759044763e-05, + "loss": 0.0044, + "num_tokens": 16306887.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 499.0, + "completions/mean_terminated_length": 499.0, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.36137244050913114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71875, + "kl": 0.027180141711141914, + "learning_rate": 1.9607532060346828e-05, + "loss": 0.0011, + "num_tokens": 16319527.0, + "reward": 1.2291666269302368, + "reward_std": 0.7586581110954285, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.29574811458587646, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 595.5, + "completions/mean_terminated_length": 595.5, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "epoch": 0.36155690831949827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.59375, + "kl": 0.021035279147326946, + "learning_rate": 1.9606638365603855e-05, + "loss": 0.0008, + "num_tokens": 16333363.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 367.375, + "completions/mean_terminated_length": 367.375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.36174137612986534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.05404191114939749, + "learning_rate": 1.9605743674908506e-05, + "loss": 0.0022, + "num_tokens": 16341038.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 284.125, + "completions/mean_terminated_length": 284.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.3619258439402324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.07245515380054712, + "learning_rate": 1.9604847988353528e-05, + "loss": 0.0029, + "num_tokens": 16350591.0, + "reward": 1.383928656578064, + "reward_std": 0.3873688876628876, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3839285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.3873688876628876, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 282.5, + "completions/mean_terminated_length": 282.5, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.36211031175059955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.056247488828375936, + "learning_rate": 1.9603951306031787e-05, + "loss": 0.0023, + "num_tokens": 16360707.0, + "reward": 1.7939189672470093, + "reward_std": 0.09438751637935638, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7939189672470093, + "rewards/fixed_code_pass_all_test_reward/std": 0.09438753128051758, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 406.625, + "completions/mean_terminated_length": 406.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.3622947795609666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017333984375, + "kl": 0.01138675882248208, + "learning_rate": 1.960305362803624e-05, + "loss": 0.0005, + "num_tokens": 16368216.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 176.5, + "completions/mean_terminated_length": 176.5, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.3624792473713337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.022903859382495284, + "learning_rate": 1.960215495445996e-05, + "loss": 0.0009, + "num_tokens": 16372628.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 424.0, + "completions/mean_terminated_length": 424.0, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.3626637151817008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.048770251451060176, + "learning_rate": 1.96012552853961e-05, + "loss": 0.002, + "num_tokens": 16385204.0, + "reward": 1.6815476417541504, + "reward_std": 0.1504165232181549, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6815476417541504, + "rewards/fixed_code_pass_all_test_reward/std": 0.15041649341583252, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 431.625, + "completions/mean_terminated_length": 431.625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.3628481829920679, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.03717265767045319, + "learning_rate": 1.960035462093795e-05, + "loss": 0.0015, + "num_tokens": 16393881.0, + "reward": 1.828125, + "reward_std": 0.22097086906433105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, + "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 451.5, + "completions/mean_terminated_length": 451.5, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.36303265080243496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.046876417472958565, + "learning_rate": 1.959945296117887e-05, + "loss": 0.0019, + "num_tokens": 16406581.0, + "reward": 1.0508474111557007, + "reward_std": 0.3406737148761749, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17584745585918427, + "rewards/fixed_code_pass_all_test_reward/std": 0.061404142528772354, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.3632171186128021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.039232010720297694, + "learning_rate": 1.959855030621235e-05, + "loss": 0.0016, + "num_tokens": 16411691.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 404.75, + "completions/mean_terminated_length": 404.75, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.36340158642316917, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023681640625, + "kl": 0.02652712434064597, + "learning_rate": 1.959764665613196e-05, + "loss": 0.0011, + "num_tokens": 16421193.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 350.75, + "completions/mean_terminated_length": 350.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.36358605423353624, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.046083953231573105, + "learning_rate": 1.9596742011031394e-05, + "loss": 0.0018, + "num_tokens": 16432591.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 202.875, + "completions/mean_terminated_length": 202.875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.36377052204390337, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.07487546419724822, + "learning_rate": 1.9595836371004434e-05, + "loss": 0.003, + "num_tokens": 16438014.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 225.375, + "completions/mean_terminated_length": 225.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.36395498985427044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.040786647354252636, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.0016, + "num_tokens": 16446417.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.3641394576646375, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.05210401746444404, + "learning_rate": 1.9594022106547007e-05, + "loss": 0.0021, + "num_tokens": 16452786.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 385.0, + "completions/mean_terminated_length": 385.0, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.3643239254750046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.053174142027273774, + "learning_rate": 1.959311348230463e-05, + "loss": 0.0021, + "num_tokens": 16462706.0, + "reward": 1.6607142686843872, + "reward_std": 0.527315080165863, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.24743583798408508, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 473.125, + "completions/mean_terminated_length": 248.1428680419922, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.3645083932853717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.04191380526754074, + "learning_rate": 1.9592203863512038e-05, + "loss": 0.0017, + "num_tokens": 16474083.0, + "reward": 1.1011905670166016, + "reward_std": 0.4611133635044098, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2261904776096344, + "rewards/fixed_code_pass_all_test_reward/std": 0.15165644884109497, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 351.5, + "completions/mean_terminated_length": 351.5, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.3646928610957388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.09459242376033217, + "learning_rate": 1.9591293250263542e-05, + "loss": 0.0038, + "num_tokens": 16480655.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 258.875, + "completions/mean_terminated_length": 258.875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.36487732890610586, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.05401582596823573, + "learning_rate": 1.9590381642653546e-05, + "loss": 0.0022, + "num_tokens": 16489318.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 341.125, + "completions/mean_terminated_length": 341.125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.365061796716473, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.11960953380912542, + "learning_rate": 1.9589469040776554e-05, + "loss": 0.0048, + "num_tokens": 16500055.0, + "reward": 1.3636363744735718, + "reward_std": 0.2571297585964203, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.2571297585964203, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 509.0, + "completions/mean_terminated_length": 509.0, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.36524626452684006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.60546875, + "kl": 0.029337375657632947, + "learning_rate": 1.9588555444727186e-05, + "loss": 0.0012, + "num_tokens": 16512055.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 223.125, + "completions/mean_terminated_length": 223.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.36543073233720713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.021767294965684414, + "learning_rate": 1.9587640854600155e-05, + "loss": 0.0009, + "num_tokens": 16516920.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 506.0, + "completions/mean_terminated_length": 506.0, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.36561520014757426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.03980931709520519, + "learning_rate": 1.9586725270490277e-05, + "loss": 0.0016, + "num_tokens": 16525264.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 199.75, + "completions/mean_terminated_length": 199.75, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.36579966795794133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.04554255283437669, + "learning_rate": 1.9585808692492477e-05, + "loss": 0.0018, + "num_tokens": 16529598.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 258.625, + "completions/mean_terminated_length": 258.625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.3659841357683084, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.01147538810619153, + "learning_rate": 1.958489112070178e-05, + "loss": 0.0005, + "num_tokens": 16535219.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 451.0, + "completions/mean_terminated_length": 451.0, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.36616860357867553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.051714921137318015, + "learning_rate": 1.958397255521331e-05, + "loss": 0.0021, + "num_tokens": 16548395.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 395.625, + "completions/mean_terminated_length": 395.625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.3663530713890426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.05783073417842388, + "learning_rate": 1.9583052996122297e-05, + "loss": 0.0023, + "num_tokens": 16556432.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 389.375, + "completions/mean_terminated_length": 389.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3665375391994097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.05736004235222936, + "learning_rate": 1.9582132443524084e-05, + "loss": 0.0023, + "num_tokens": 16567251.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 632.25, + "completions/mean_terminated_length": 632.25, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "epoch": 0.3667220070097768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.04524946981109679, + "learning_rate": 1.9581210897514097e-05, + "loss": 0.0018, + "num_tokens": 16583005.0, + "reward": 1.5703125, + "reward_std": 0.4869244694709778, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3314563035964966, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 239.5, + "completions/mean_terminated_length": 239.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.3669064748201439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.0239481118042022, + "learning_rate": 1.9580288358187882e-05, + "loss": 0.001, + "num_tokens": 16587889.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.36709094263051095, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.07658073888160288, + "learning_rate": 1.9579364825641082e-05, + "loss": 0.0031, + "num_tokens": 16592294.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 429.875, + "completions/mean_terminated_length": 429.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.3672754104408781, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.03958742739632726, + "learning_rate": 1.957844029996944e-05, + "loss": 0.0016, + "num_tokens": 16601677.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 214.5, + "completions/mean_terminated_length": 214.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.36745987825124515, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.08483300544321537, + "learning_rate": 1.95775147812688e-05, + "loss": 0.0034, + "num_tokens": 16607393.0, + "reward": 1.4328703880310059, + "reward_std": 0.4696279764175415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43287038803100586, + "rewards/fixed_code_pass_all_test_reward/std": 0.4696279466152191, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 557.25, + "completions/mean_terminated_length": 344.2857360839844, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.3676443460616122, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.59375, + "kl": 0.04511245700996369, + "learning_rate": 1.9576588269635123e-05, + "loss": 0.0018, + "num_tokens": 16616739.0, + "reward": 1.4821429252624512, + "reward_std": 0.744023859500885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.5050762891769409, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.36782881387197935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.06390029191970825, + "learning_rate": 1.9575660765164462e-05, + "loss": 0.0026, + "num_tokens": 16625886.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 278.75, + "completions/mean_terminated_length": 278.75, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.3680132816823464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.049923276295885444, + "learning_rate": 1.9574732267952972e-05, + "loss": 0.002, + "num_tokens": 16633764.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3681977494927135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.07434457261115313, + "learning_rate": 1.957380277809691e-05, + "loss": 0.003, + "num_tokens": 16642320.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 327.625, + "completions/mean_terminated_length": 327.625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.36838221730308063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.09358723415061831, + "learning_rate": 1.9572872295692643e-05, + "loss": 0.0037, + "num_tokens": 16651141.0, + "reward": 1.84375, + "reward_std": 0.35197147727012634, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.3685666851134477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.05443906970322132, + "learning_rate": 1.9571940820836638e-05, + "loss": 0.0022, + "num_tokens": 16655628.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 268.375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3687511529238148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.06278847018256783, + "learning_rate": 1.9571008353625466e-05, + "loss": 0.0025, + "num_tokens": 16661999.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 453.375, + "completions/mean_terminated_length": 453.375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.3689356207341819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.03801625268533826, + "learning_rate": 1.9570074894155792e-05, + "loss": 0.0015, + "num_tokens": 16671474.0, + "reward": 1.5130208730697632, + "reward_std": 0.4067171812057495, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6380208730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.257799357175827, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 488.125, + "completions/mean_terminated_length": 488.125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.369120088544549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0198974609375, + "kl": 0.007652744097867981, + "learning_rate": 1.9569140442524396e-05, + "loss": 0.0003, + "num_tokens": 16678843.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 322.125, + "completions/mean_terminated_length": 322.125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.36930455635491605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.05371675547212362, + "learning_rate": 1.9568204998828157e-05, + "loss": 0.0021, + "num_tokens": 16685948.0, + "reward": 1.4153225421905518, + "reward_std": 0.4841589331626892, + "rewards/fixed_code_pass_all_test_reward/mean": 0.41532257199287415, + "rewards/fixed_code_pass_all_test_reward/std": 0.4841589033603668, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 579.375, + "completions/mean_terminated_length": 579.375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.3694890241652832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41796875, + "kl": 0.022888626786880195, + "learning_rate": 1.9567268563164052e-05, + "loss": 0.0009, + "num_tokens": 16698271.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 399.0, + "completions/mean_terminated_length": 399.0, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.36967349197565025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.044051578268408775, + "learning_rate": 1.9566331135629166e-05, + "loss": 0.0018, + "num_tokens": 16710639.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 582.25, + "completions/mean_terminated_length": 582.25, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.3698579597860173, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.017927406821399927, + "learning_rate": 1.9565392716320685e-05, + "loss": 0.0007, + "num_tokens": 16720793.0, + "reward": 1.8928570747375488, + "reward_std": 0.19839003682136536, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 242.875, + "completions/mean_terminated_length": 242.875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.37004242759638445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.07648348552174866, + "learning_rate": 1.9564453305335896e-05, + "loss": 0.0031, + "num_tokens": 16730624.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 250.625, + "completions/mean_terminated_length": 250.625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3702268954067515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.04902959894388914, + "learning_rate": 1.9563512902772197e-05, + "loss": 0.002, + "num_tokens": 16738541.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3704113632171186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.03605345683172345, + "learning_rate": 1.956257150872708e-05, + "loss": 0.0014, + "num_tokens": 16747942.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 292.375, + "completions/mean_terminated_length": 292.375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.3705958310274857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.059115244541317225, + "learning_rate": 1.9561629123298133e-05, + "loss": 0.0024, + "num_tokens": 16756905.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 247.125, + "completions/mean_terminated_length": 247.125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3707802988378528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.07784890243783593, + "learning_rate": 1.956068574658307e-05, + "loss": 0.0031, + "num_tokens": 16764890.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.37096476664821987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.03753680735826492, + "learning_rate": 1.9559741378679686e-05, + "loss": 0.0015, + "num_tokens": 16773230.0, + "reward": 1.7857142686843872, + "reward_std": 0.18177399039268494, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.18177400529384613, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 254.375, + "completions/mean_terminated_length": 254.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.371149234458587, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.05322813929524273, + "learning_rate": 1.955879601968589e-05, + "loss": 0.0021, + "num_tokens": 16783281.0, + "reward": 1.0480769872665405, + "reward_std": 0.5363971590995789, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17307692766189575, + "rewards/fixed_code_pass_all_test_reward/std": 0.33655810356140137, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.37133370226895407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.031371780671179295, + "learning_rate": 1.9557849669699693e-05, + "loss": 0.0013, + "num_tokens": 16789243.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 224.75, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.37151817007932114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.10677461046725512, + "learning_rate": 1.9556902328819204e-05, + "loss": 0.0043, + "num_tokens": 16798713.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.37170263788968827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.193359375, + "kl": 0.03444643720285967, + "learning_rate": 1.955595399714263e-05, + "loss": 0.0014, + "num_tokens": 16803409.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 319.625, + "completions/mean_terminated_length": 319.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.37188710570005534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.026809882023371756, + "learning_rate": 1.9555004674768295e-05, + "loss": 0.0011, + "num_tokens": 16810430.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 300.375, + "completions/mean_terminated_length": 300.375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.3720715735104224, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.07040323503315449, + "learning_rate": 1.955405436179462e-05, + "loss": 0.0028, + "num_tokens": 16817065.0, + "reward": 1.100000023841858, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 342.0, + "completions/mean_terminated_length": 342.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.37225604132078954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04891491308808327, + "learning_rate": 1.955310305832012e-05, + "loss": 0.002, + "num_tokens": 16824465.0, + "reward": 1.6041667461395264, + "reward_std": 0.3204349875450134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.3204349875450134, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 348.5, + "completions/mean_terminated_length": 348.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.3724405091311566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.04219663317780942, + "learning_rate": 1.955215076444343e-05, + "loss": 0.0017, + "num_tokens": 16833437.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.3726249769415237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.027353938901796937, + "learning_rate": 1.9551197480263268e-05, + "loss": 0.0011, + "num_tokens": 16839454.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 459.5, + "completions/mean_terminated_length": 459.5, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.3728094447518908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.046773074893280864, + "learning_rate": 1.955024320587847e-05, + "loss": 0.0019, + "num_tokens": 16848418.0, + "reward": 1.4861111640930176, + "reward_std": 0.4966821074485779, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6111111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.4824979305267334, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 379.75, + "completions/mean_terminated_length": 379.75, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.3729939125622579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.07454425725154579, + "learning_rate": 1.9549287941387966e-05, + "loss": 0.003, + "num_tokens": 16858448.0, + "reward": 1.0625, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 353.375, + "completions/mean_terminated_length": 353.375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.37317838037262496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71484375, + "kl": 0.03552156710065901, + "learning_rate": 1.9548331686890793e-05, + "loss": 0.0014, + "num_tokens": 16868427.0, + "reward": 1.953125, + "reward_std": 0.13258251547813416, + "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.13258251547813416, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 252.875, + "completions/mean_terminated_length": 252.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.3733628481829921, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.06974249822087586, + "learning_rate": 1.9547374442486086e-05, + "loss": 0.0028, + "num_tokens": 16873514.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.37354731599335916, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.060181002132594585, + "learning_rate": 1.9546416208273086e-05, + "loss": 0.0024, + "num_tokens": 16883558.0, + "reward": 1.3611111640930176, + "reward_std": 0.07856737822294235, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3611111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.07856743037700653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 210.375, + "completions/mean_terminated_length": 210.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.37373178380372624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.03448632266372442, + "learning_rate": 1.954545698435114e-05, + "loss": 0.0014, + "num_tokens": 16888217.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 398.875, + "completions/mean_terminated_length": 398.875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.37391625161409336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.06144628161564469, + "learning_rate": 1.9544496770819692e-05, + "loss": 0.0025, + "num_tokens": 16900648.0, + "reward": 1.6931817531585693, + "reward_std": 0.33291494846343994, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6931818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.33291494846343994, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 300.0, + "completions/mean_terminated_length": 300.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.37410071942446044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.06268365075811744, + "learning_rate": 1.9543535567778293e-05, + "loss": 0.0025, + "num_tokens": 16907152.0, + "reward": 1.6691176891326904, + "reward_std": 0.35004061460494995, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7941176891326904, + "rewards/fixed_code_pass_all_test_reward/std": 0.04159451276063919, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 372.625, + "completions/mean_terminated_length": 372.625, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.3742851872348275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.042413814924657345, + "learning_rate": 1.9542573375326592e-05, + "loss": 0.0017, + "num_tokens": 16915013.0, + "reward": 1.6363636255264282, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6363636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.37446965504519464, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.050219545140862465, + "learning_rate": 1.954161019356434e-05, + "loss": 0.002, + "num_tokens": 16919804.0, + "reward": 0.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 157.75, + "completions/mean_terminated_length": 157.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3746541228555617, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9921875, + "kl": 0.04159138537943363, + "learning_rate": 1.95406460225914e-05, + "loss": 0.0017, + "num_tokens": 16923978.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 186.0, + "completions/mean_terminated_length": 186.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3748385906659288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.049045275431126356, + "learning_rate": 1.953968086250772e-05, + "loss": 0.002, + "num_tokens": 16928346.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 268.375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.3750230584762959, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.12399580841884017, + "learning_rate": 1.9538714713413373e-05, + "loss": 0.005, + "num_tokens": 16938781.0, + "reward": 1.4930555820465088, + "reward_std": 0.2438620775938034, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4930555820465088, + "rewards/fixed_code_pass_all_test_reward/std": 0.2438620626926422, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 217.5, + "completions/mean_terminated_length": 217.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.375207526286663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.05269163614138961, + "learning_rate": 1.9537747575408515e-05, + "loss": 0.0021, + "num_tokens": 16948673.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 164.25, + "completions/mean_terminated_length": 164.25, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.37539199409703006, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.044148302054964006, + "learning_rate": 1.9536779448593418e-05, + "loss": 0.0018, + "num_tokens": 16952819.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 294.625, + "completions/mean_terminated_length": 294.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.3755764619073972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.03891810623463243, + "learning_rate": 1.953581033306845e-05, + "loss": 0.0016, + "num_tokens": 16959408.0, + "reward": 1.9305555820465088, + "reward_std": 0.19641853868961334, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9305555820465088, + "rewards/fixed_code_pass_all_test_reward/std": 0.19641855359077454, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 359.125, + "completions/mean_terminated_length": 359.125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.37576092971776426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.04182920535095036, + "learning_rate": 1.9534840228934077e-05, + "loss": 0.0017, + "num_tokens": 16967001.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.37594539752813133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.026598265510983765, + "learning_rate": 1.9533869136290882e-05, + "loss": 0.0011, + "num_tokens": 16973295.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 211.75, + "completions/mean_terminated_length": 211.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.37612986533849846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.03392723994329572, + "learning_rate": 1.953289705523953e-05, + "loss": 0.0014, + "num_tokens": 16977837.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 159.0, + "completions/mean_terminated_length": 159.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.37631433314886553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.055806771386414766, + "learning_rate": 1.953192398588081e-05, + "loss": 0.0022, + "num_tokens": 16981845.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.3764988009592326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.05344292731024325, + "learning_rate": 1.9530949928315598e-05, + "loss": 0.0021, + "num_tokens": 16990914.0, + "reward": 1.8624999523162842, + "reward_std": 0.11386082321405411, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8624999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.1138608381152153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 299.25, + "completions/mean_terminated_length": 299.25, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.3766832687695997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.06335927359759808, + "learning_rate": 1.9529974882644882e-05, + "loss": 0.0025, + "num_tokens": 16997436.0, + "reward": 1.798076868057251, + "reward_std": 0.29056012630462646, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7980769276618958, + "rewards/fixed_code_pass_all_test_reward/std": 0.29056015610694885, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3768677365799668, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.0410170522518456, + "learning_rate": 1.9528998848969746e-05, + "loss": 0.0016, + "num_tokens": 17008146.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 417.5, + "completions/mean_terminated_length": 417.5, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.3770522043903339, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.03826856892555952, + "learning_rate": 1.9528021827391376e-05, + "loss": 0.0015, + "num_tokens": 17016030.0, + "reward": 1.8081395626068115, + "reward_std": 0.245723694562912, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8081395626068115, + "rewards/fixed_code_pass_all_test_reward/std": 0.245723694562912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 478.125, + "completions/mean_terminated_length": 478.125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.37723667220070095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6953125, + "kl": 0.03932886617258191, + "learning_rate": 1.9527043818011063e-05, + "loss": 0.0016, + "num_tokens": 17026671.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 234.375, + "completions/mean_terminated_length": 234.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3774211400110681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.04826147947460413, + "learning_rate": 1.9526064820930205e-05, + "loss": 0.0019, + "num_tokens": 17036082.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 581.75, + "completions/mean_terminated_length": 581.75, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.37760560782143515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.031019500456750393, + "learning_rate": 1.9525084836250296e-05, + "loss": 0.0012, + "num_tokens": 17051008.0, + "reward": 1.6083333492279053, + "reward_std": 0.39107102155685425, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6083333492279053, + "rewards/fixed_code_pass_all_test_reward/std": 0.39107099175453186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 198.625, + "completions/mean_terminated_length": 198.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.3777900756318022, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.05638855532743037, + "learning_rate": 1.9524103864072933e-05, + "loss": 0.0023, + "num_tokens": 17055485.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 315.5, + "completions/mean_terminated_length": 315.5, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.37797454344216935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.020734177203848958, + "learning_rate": 1.952312190449982e-05, + "loss": 0.0008, + "num_tokens": 17061617.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 359.75, + "completions/mean_terminated_length": 359.75, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.3781590112525364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.04868621751666069, + "learning_rate": 1.9522138957632758e-05, + "loss": 0.0019, + "num_tokens": 17071583.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 563.625, + "completions/mean_terminated_length": 563.625, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.3783434790629035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.03754097945056856, + "learning_rate": 1.9521155023573648e-05, + "loss": 0.0015, + "num_tokens": 17087164.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 521.125, + "completions/mean_terminated_length": 521.125, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.3785279468732706, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.04583070264197886, + "learning_rate": 1.9520170102424506e-05, + "loss": 0.0018, + "num_tokens": 17096629.0, + "reward": 0.3583333492279053, + "reward_std": 0.6826070547103882, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10833333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.2568119466304779, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.3787124146836377, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.04443188733421266, + "learning_rate": 1.9519184194287434e-05, + "loss": 0.0018, + "num_tokens": 17102482.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 332.875, + "completions/mean_terminated_length": 332.875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.37889688249400477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.03437469992786646, + "learning_rate": 1.9518197299264652e-05, + "loss": 0.0014, + "num_tokens": 17111489.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 599.125, + "completions/mean_terminated_length": 599.125, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.3790813503043719, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6953125, + "kl": 0.04572005337104201, + "learning_rate": 1.951720941745847e-05, + "loss": 0.0018, + "num_tokens": 17124202.0, + "reward": 1.1666667461395264, + "reward_std": 0.3421454131603241, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.3421454429626465, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.379265818114739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.09020740818232298, + "learning_rate": 1.951622054897131e-05, + "loss": 0.0036, + "num_tokens": 17134042.0, + "reward": 1.5297619104385376, + "reward_std": 0.2607426047325134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5297619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.2607426345348358, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 296.875, + "completions/mean_terminated_length": 296.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.37945028592510605, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.07114747120067477, + "learning_rate": 1.9515230693905682e-05, + "loss": 0.0028, + "num_tokens": 17142521.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 392.0, + "completions/mean_terminated_length": 392.0, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.3796347537354732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.07602614630013704, + "learning_rate": 1.9514239852364214e-05, + "loss": 0.003, + "num_tokens": 17152953.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 241.75, + "completions/mean_terminated_length": 241.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.37981922154584025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.05567008024081588, + "learning_rate": 1.951324802444963e-05, + "loss": 0.0022, + "num_tokens": 17157671.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 369.5, + "completions/mean_terminated_length": 369.5, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.3800036893562073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.045564227970317006, + "learning_rate": 1.9512255210264755e-05, + "loss": 0.0018, + "num_tokens": 17164819.0, + "reward": 1.509615421295166, + "reward_std": 0.47407424449920654, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5096153616905212, + "rewards/fixed_code_pass_all_test_reward/std": 0.47407427430152893, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 511.625, + "completions/mean_terminated_length": 511.625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.38018815716657445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.046960206469520926, + "learning_rate": 1.9511261409912515e-05, + "loss": 0.0019, + "num_tokens": 17174152.0, + "reward": 1.5546875, + "reward_std": 0.3230726718902588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5546875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3230726420879364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 468.5, + "completions/mean_terminated_length": 468.5, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.3803726249769415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.05107648251578212, + "learning_rate": 1.9510266623495947e-05, + "loss": 0.002, + "num_tokens": 17185076.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 490.75, + "completions/mean_terminated_length": 490.75, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.3805570927873086, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.05041854712180793, + "learning_rate": 1.9509270851118175e-05, + "loss": 0.002, + "num_tokens": 17195266.0, + "reward": 1.6607142686843872, + "reward_std": 0.716069221496582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 557.75, + "completions/mean_terminated_length": 557.75, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.3807415605976757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.07225440349429846, + "learning_rate": 1.950827409288244e-05, + "loss": 0.0029, + "num_tokens": 17208736.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 548.125, + "completions/mean_terminated_length": 548.125, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.3809260284080428, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.07840250292792916, + "learning_rate": 1.9507276348892085e-05, + "loss": 0.0031, + "num_tokens": 17222073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 337.75, + "completions/mean_terminated_length": 337.75, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.38111049621840987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.07094631530344486, + "learning_rate": 1.9506277619250535e-05, + "loss": 0.0028, + "num_tokens": 17231919.0, + "reward": 1.524193525314331, + "reward_std": 0.4344158172607422, + "rewards/fixed_code_pass_all_test_reward/mean": 0.524193525314331, + "rewards/fixed_code_pass_all_test_reward/std": 0.4344158172607422, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 296.5, + "completions/mean_terminated_length": 296.5, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.381294964028777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.030968716484494507, + "learning_rate": 1.9505277904061343e-05, + "loss": 0.0012, + "num_tokens": 17238643.0, + "reward": 1.8916666507720947, + "reward_std": 0.3064129650592804, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8916666507720947, + "rewards/fixed_code_pass_all_test_reward/std": 0.306412935256958, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 293.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.38147943183914407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.05680610775016248, + "learning_rate": 1.9504277203428148e-05, + "loss": 0.0023, + "num_tokens": 17248619.0, + "reward": 1.8571429252624512, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3499270975589752, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.38166389964951114, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.09418758423998952, + "learning_rate": 1.9503275517454696e-05, + "loss": 0.0038, + "num_tokens": 17252660.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 320.75, + "completions/mean_terminated_length": 320.75, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.38184836745987827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.02664883271791041, + "learning_rate": 1.9502272846244835e-05, + "loss": 0.0011, + "num_tokens": 17259290.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 640.75, + "completions/mean_terminated_length": 171.6666717529297, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.38203283527024534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.053099905722774565, + "learning_rate": 1.950126918990252e-05, + "loss": 0.0021, + "num_tokens": 17267288.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.3822173030806124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.04842101130634546, + "learning_rate": 1.95002645485318e-05, + "loss": 0.0019, + "num_tokens": 17276767.0, + "reward": 1.5, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 329.0, + "completions/mean_terminated_length": 329.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.38240177089097954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.05296260421164334, + "learning_rate": 1.9499258922236825e-05, + "loss": 0.0021, + "num_tokens": 17286327.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 459.0, + "completions/mean_terminated_length": 459.0, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.3825862387013466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.029027825919911265, + "learning_rate": 1.949825231112186e-05, + "loss": 0.0012, + "num_tokens": 17296679.0, + "reward": 1.8645833730697632, + "reward_std": 0.25074291229248047, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.25074294209480286, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 375.25, + "completions/mean_terminated_length": 375.25, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.3827707065117137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.06703383522108197, + "learning_rate": 1.949724471529126e-05, + "loss": 0.0027, + "num_tokens": 17304273.0, + "reward": 1.375, + "reward_std": 0.13363061845302582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.13363061845302582, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 433.625, + "completions/mean_terminated_length": 433.625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.3829551743220808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.060566093772649765, + "learning_rate": 1.9496236134849485e-05, + "loss": 0.0024, + "num_tokens": 17312598.0, + "reward": 1.6041666269302368, + "reward_std": 0.4266657531261444, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6041666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.426665723323822, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3831396421324479, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.06383937736973166, + "learning_rate": 1.9495226569901098e-05, + "loss": 0.0026, + "num_tokens": 17323864.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 186.625, + "completions/mean_terminated_length": 186.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.38332410994281496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.04194837110117078, + "learning_rate": 1.9494216020550765e-05, + "loss": 0.0017, + "num_tokens": 17328165.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.3835085777531821, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.04810000048018992, + "learning_rate": 1.9493204486903252e-05, + "loss": 0.0019, + "num_tokens": 17336731.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 249.625, + "completions/mean_terminated_length": 249.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.38369304556354916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0708970776759088, + "learning_rate": 1.9492191969063427e-05, + "loss": 0.0028, + "num_tokens": 17344352.0, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 336.0, + "completions/mean_terminated_length": 336.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.38387751337391623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.050205412320792675, + "learning_rate": 1.9491178467136265e-05, + "loss": 0.002, + "num_tokens": 17353952.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.38406198118428336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.06541570695117116, + "learning_rate": 1.9490163981226836e-05, + "loss": 0.0026, + "num_tokens": 17361737.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 239.625, + "completions/mean_terminated_length": 239.625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.38424644899465044, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05118050426244736, + "learning_rate": 1.948914851144032e-05, + "loss": 0.002, + "num_tokens": 17370454.0, + "reward": 1.9318182468414307, + "reward_std": 0.09409989416599274, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181276321411, + "rewards/fixed_code_pass_all_test_reward/std": 0.09409984946250916, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 311.625, + "completions/mean_terminated_length": 311.625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3844309168050175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.0463952929712832, + "learning_rate": 1.9488132057881984e-05, + "loss": 0.0019, + "num_tokens": 17377419.0, + "reward": 1.2959184646606445, + "reward_std": 0.3390458822250366, + "rewards/fixed_code_pass_all_test_reward/mean": 0.42091837525367737, + "rewards/fixed_code_pass_all_test_reward/std": 0.06164819374680519, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.38461538461538464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.03915958805009723, + "learning_rate": 1.9487114620657216e-05, + "loss": 0.0016, + "num_tokens": 17382205.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.3847998524257517, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.06372892530634999, + "learning_rate": 1.9486096199871497e-05, + "loss": 0.0025, + "num_tokens": 17391594.0, + "reward": 1.798387050628662, + "reward_std": 0.3733145296573639, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7983871102333069, + "rewards/fixed_code_pass_all_test_reward/std": 0.3733145594596863, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 383.75, + "completions/mean_terminated_length": 383.75, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.3849843202361188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2060546875, + "kl": 0.05129766184836626, + "learning_rate": 1.9485076795630406e-05, + "loss": 0.0021, + "num_tokens": 17399824.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 325.25, + "completions/mean_terminated_length": 325.25, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.3851687880464859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.02355732163414359, + "learning_rate": 1.948405640803963e-05, + "loss": 0.0009, + "num_tokens": 17409866.0, + "reward": 1.46875, + "reward_std": 0.4712729752063751, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 262.5, + "completions/mean_terminated_length": 262.5, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.385353255856853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.025293090206105262, + "learning_rate": 1.9483035037204955e-05, + "loss": 0.001, + "num_tokens": 17415406.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.38553772366722006, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.03329848707653582, + "learning_rate": 1.948201268323227e-05, + "loss": 0.0013, + "num_tokens": 17420518.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.3857221914775872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05645864538382739, + "learning_rate": 1.9480989346227565e-05, + "loss": 0.0023, + "num_tokens": 17431089.0, + "reward": 1.7620967626571655, + "reward_std": 0.44126981496810913, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7620967626571655, + "rewards/fixed_code_pass_all_test_reward/std": 0.44126981496810913, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 540.0, + "completions/mean_terminated_length": 540.0, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.38590665928795426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.03197801043279469, + "learning_rate": 1.9479965026296938e-05, + "loss": 0.0013, + "num_tokens": 17441361.0, + "reward": 1.6510417461395264, + "reward_std": 0.22097088396549225, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6510416865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.38609112709832133, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.02311966847628355, + "learning_rate": 1.947893972354658e-05, + "loss": 0.0009, + "num_tokens": 17446990.0, + "reward": 1.78125, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 302.75, + "completions/mean_terminated_length": 302.75, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.38627559490868846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.03848932380788028, + "learning_rate": 1.9477913438082785e-05, + "loss": 0.0015, + "num_tokens": 17456292.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.38646006271905553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.09161822684109211, + "learning_rate": 1.9476886170011955e-05, + "loss": 0.0037, + "num_tokens": 17465129.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 400.625, + "completions/mean_terminated_length": 400.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.3866445305294226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.03116665210109204, + "learning_rate": 1.947585791944059e-05, + "loss": 0.0012, + "num_tokens": 17473950.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.38682899833978973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.0637482744641602, + "learning_rate": 1.9474828686475286e-05, + "loss": 0.0025, + "num_tokens": 17479189.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 121.25, + "completions/mean_terminated_length": 121.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3870134661501568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.04604291799478233, + "learning_rate": 1.947379847122276e-05, + "loss": 0.0018, + "num_tokens": 17482879.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 254.375, + "completions/mean_terminated_length": 254.375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3871979339605239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.06750999251380563, + "learning_rate": 1.9472767273789804e-05, + "loss": 0.0027, + "num_tokens": 17491914.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 125.75, + "completions/mean_terminated_length": 125.75, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.387382401770891, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.07673660106956959, + "learning_rate": 1.9471735094283337e-05, + "loss": 0.0031, + "num_tokens": 17495752.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 248.375, + "completions/mean_terminated_length": 248.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.3875668695812581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.03214054787531495, + "learning_rate": 1.9470701932810364e-05, + "loss": 0.0013, + "num_tokens": 17500995.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 274.0, + "completions/mean_terminated_length": 274.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.38775133739162515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.058309352956712246, + "learning_rate": 1.946966778947799e-05, + "loss": 0.0023, + "num_tokens": 17507595.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.3879358052019923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.11231824476271868, + "learning_rate": 1.946863266439344e-05, + "loss": 0.0045, + "num_tokens": 17514197.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.38812027301235935, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.0626998080406338, + "learning_rate": 1.9467596557664018e-05, + "loss": 0.0025, + "num_tokens": 17522012.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.3883047408227264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.185546875, + "kl": 0.06708673760294914, + "learning_rate": 1.946655946939715e-05, + "loss": 0.0027, + "num_tokens": 17531029.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.38848920863309355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.08421134203672409, + "learning_rate": 1.9465521399700346e-05, + "loss": 0.0034, + "num_tokens": 17538282.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.3886736764434606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.07015104405581951, + "learning_rate": 1.9464482348681232e-05, + "loss": 0.0028, + "num_tokens": 17543132.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3888581442538277, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.056338427821174264, + "learning_rate": 1.9463442316447527e-05, + "loss": 0.0023, + "num_tokens": 17547753.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 451.75, + "completions/mean_terminated_length": 451.75, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.38904261206419477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69140625, + "kl": 0.043425839161500335, + "learning_rate": 1.946240130310706e-05, + "loss": 0.0017, + "num_tokens": 17561623.0, + "reward": 1.0204918384552002, + "reward_std": 0.057959601283073425, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02049180306494236, + "rewards/fixed_code_pass_all_test_reward/std": 0.05795957148075104, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 244.125, + "completions/mean_terminated_length": 244.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.3892270798745619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.0684242770075798, + "learning_rate": 1.9461359308767745e-05, + "loss": 0.0027, + "num_tokens": 17570264.0, + "reward": 1.807692289352417, + "reward_std": 0.376844584941864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.38941154768492897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.06828240863978863, + "learning_rate": 1.946031633353762e-05, + "loss": 0.0027, + "num_tokens": 17580990.0, + "reward": 1.8729166984558105, + "reward_std": 0.14084140956401825, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8729166984558105, + "rewards/fixed_code_pass_all_test_reward/std": 0.14084143936634064, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 348.0, + "completions/mean_terminated_length": 348.0, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.38959601549529604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.029277416004333645, + "learning_rate": 1.945927237752481e-05, + "loss": 0.0012, + "num_tokens": 17587582.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 380.75, + "completions/mean_terminated_length": 380.75, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.3897804833056632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.05177688947878778, + "learning_rate": 1.9458227440837545e-05, + "loss": 0.0021, + "num_tokens": 17597092.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.38996495111603025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.053191795479506254, + "learning_rate": 1.9457181523584155e-05, + "loss": 0.0021, + "num_tokens": 17602000.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 351.0, + "completions/mean_terminated_length": 351.0, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.3901494189263973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035888671875, + "kl": 0.02781262795906514, + "learning_rate": 1.9456134625873076e-05, + "loss": 0.0011, + "num_tokens": 17609856.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.39033388673676445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.07971406308934093, + "learning_rate": 1.9455086747812847e-05, + "loss": 0.0032, + "num_tokens": 17614141.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 168.25, + "completions/mean_terminated_length": 168.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3905183545471315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051025390625, + "kl": 0.04130468424409628, + "learning_rate": 1.94540378895121e-05, + "loss": 0.0017, + "num_tokens": 17622263.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.3907028223574986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.04059723252430558, + "learning_rate": 1.9452988051079572e-05, + "loss": 0.0016, + "num_tokens": 17627761.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.3908872901678657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.05294724949635565, + "learning_rate": 1.945193723262411e-05, + "loss": 0.0021, + "num_tokens": 17632040.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 262.25, + "completions/mean_terminated_length": 262.25, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.3910717579782328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.08972419775091112, + "learning_rate": 1.945088543425465e-05, + "loss": 0.0036, + "num_tokens": 17640754.0, + "reward": 1.8863636255264282, + "reward_std": 0.32141217589378357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8863636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.32141217589378357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 346.625, + "completions/mean_terminated_length": 346.625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.39125622578859987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04183103935793042, + "learning_rate": 1.9449832656080237e-05, + "loss": 0.0017, + "num_tokens": 17649471.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 277.375, + "completions/mean_terminated_length": 277.375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.391440693598967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.05508595984429121, + "learning_rate": 1.944877889821002e-05, + "loss": 0.0022, + "num_tokens": 17655890.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 233.875, + "completions/mean_terminated_length": 233.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.39162516140933407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.04842102574184537, + "learning_rate": 1.9447724160753242e-05, + "loss": 0.0019, + "num_tokens": 17660665.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 250.75, + "completions/mean_terminated_length": 250.75, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.39180962921970114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.042912264936603606, + "learning_rate": 1.9446668443819247e-05, + "loss": 0.0017, + "num_tokens": 17669615.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 348.75, + "completions/mean_terminated_length": 348.75, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.39199409703006827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.05266399076208472, + "learning_rate": 1.944561174751749e-05, + "loss": 0.0021, + "num_tokens": 17680661.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 401.75, + "completions/mean_terminated_length": 401.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.39217856484043534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.028338501462712884, + "learning_rate": 1.9444554071957523e-05, + "loss": 0.0011, + "num_tokens": 17690715.0, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 124.375, + "completions/mean_terminated_length": 124.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.3923630326508024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.0769399469718337, + "learning_rate": 1.9443495417249e-05, + "loss": 0.0031, + "num_tokens": 17694534.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 485.0, + "completions/mean_terminated_length": 261.71429443359375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.39254750046116954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.61328125, + "kl": 0.04944518709089607, + "learning_rate": 1.944243578350167e-05, + "loss": 0.002, + "num_tokens": 17703638.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 340.125, + "completions/mean_terminated_length": 340.125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.3927319682715366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.06131342728622258, + "learning_rate": 1.944137517082539e-05, + "loss": 0.0025, + "num_tokens": 17711087.0, + "reward": 1.9500000476837158, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 259.25, + "completions/mean_terminated_length": 259.25, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3929164360819037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.04012003121897578, + "learning_rate": 1.9440313579330122e-05, + "loss": 0.0016, + "num_tokens": 17716545.0, + "reward": 1.8249999284744263, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8250000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 436.875, + "completions/mean_terminated_length": 436.875, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.3931009038922708, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0478950230171904, + "learning_rate": 1.9439251009125917e-05, + "loss": 0.0019, + "num_tokens": 17727456.0, + "reward": 1.9943182468414307, + "reward_std": 0.0035068909637629986, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9943181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.0035068909637629986, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 295.0, + "completions/mean_terminated_length": 295.0, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.3932853717026379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.05803209752775729, + "learning_rate": 1.9438187460322943e-05, + "loss": 0.0023, + "num_tokens": 17736824.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.39346983951300496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.023261202848516405, + "learning_rate": 1.9437122933031454e-05, + "loss": 0.0009, + "num_tokens": 17744668.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3936543073233721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.02751844155136496, + "learning_rate": 1.943605742736182e-05, + "loss": 0.0011, + "num_tokens": 17749021.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 128.0, + "completions/mean_terminated_length": 128.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.39383877513373916, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.46875, + "kl": 0.286012418102473, + "learning_rate": 1.9434990943424503e-05, + "loss": 0.0114, + "num_tokens": 17752765.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 330.625, + "completions/mean_terminated_length": 330.625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.39402324294410623, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0361328125, + "kl": 0.023990232264623046, + "learning_rate": 1.9433923481330067e-05, + "loss": 0.001, + "num_tokens": 17759362.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 208.875, + "completions/mean_terminated_length": 208.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.39420771075447336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.052420936757698655, + "learning_rate": 1.943285504118918e-05, + "loss": 0.0021, + "num_tokens": 17768801.0, + "reward": 1.6671512126922607, + "reward_std": 0.41614899039268494, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6671512126922607, + "rewards/fixed_code_pass_all_test_reward/std": 0.4161490201950073, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 298.0, + "completions/mean_terminated_length": 298.0, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.39439217856484043, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04765080125071108, + "learning_rate": 1.9431785623112616e-05, + "loss": 0.0019, + "num_tokens": 17775321.0, + "reward": 1.884615421295166, + "reward_std": 0.21365076303482056, + "rewards/fixed_code_pass_all_test_reward/mean": 0.884615421295166, + "rewards/fixed_code_pass_all_test_reward/std": 0.21365077793598175, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 470.0, + "completions/mean_terminated_length": 470.0, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.3945766463752075, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.03481989342253655, + "learning_rate": 1.943071522721124e-05, + "loss": 0.0014, + "num_tokens": 17790289.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 219.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.39476111418557464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.06890724692493677, + "learning_rate": 1.9429643853596025e-05, + "loss": 0.0028, + "num_tokens": 17799171.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 451.625, + "completions/mean_terminated_length": 451.625, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.3949455819959417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.046528360806405544, + "learning_rate": 1.9428571502378043e-05, + "loss": 0.0019, + "num_tokens": 17808088.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3951300498063088, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.05304823839105666, + "learning_rate": 1.9427498173668467e-05, + "loss": 0.0021, + "num_tokens": 17812357.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 447.625, + "completions/mean_terminated_length": 447.625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.3953145176166759, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.044579875422641635, + "learning_rate": 1.9426423867578576e-05, + "loss": 0.0018, + "num_tokens": 17822018.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 733.125, + "completions/mean_terminated_length": 733.125, + "completions/min_length": 647.0, + "completions/min_terminated_length": 647.0, + "epoch": 0.395498985427043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.02649968327023089, + "learning_rate": 1.9425348584219745e-05, + "loss": 0.0011, + "num_tokens": 17835291.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.39568345323741005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.05940220458433032, + "learning_rate": 1.9424272323703453e-05, + "loss": 0.0024, + "num_tokens": 17844879.0, + "reward": 1.615384578704834, + "reward_std": 0.4111711084842682, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6153846383094788, + "rewards/fixed_code_pass_all_test_reward/std": 0.41117116808891296, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 335.0, + "completions/mean_terminated_length": 335.0, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.3958679210477772, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.060845807660371065, + "learning_rate": 1.942319508614128e-05, + "loss": 0.0024, + "num_tokens": 17854407.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 281.875, + "completions/mean_terminated_length": 281.875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.39605238885814426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.05791920842602849, + "learning_rate": 1.94221168716449e-05, + "loss": 0.0023, + "num_tokens": 17864494.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 109.0, + "completions/max_terminated_length": 109.0, + "completions/mean_length": 103.375, + "completions/mean_terminated_length": 103.375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.39623685666851133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.05569723201915622, + "learning_rate": 1.9421037680326106e-05, + "loss": 0.0022, + "num_tokens": 17868057.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 323.375, + "completions/mean_terminated_length": 323.375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.39642132447887846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.03214497282169759, + "learning_rate": 1.9419957512296775e-05, + "loss": 0.0013, + "num_tokens": 17874348.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 261.875, + "completions/mean_terminated_length": 261.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.39660579228924553, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.049673862056806684, + "learning_rate": 1.941887636766889e-05, + "loss": 0.002, + "num_tokens": 17883403.0, + "reward": 1.6331522464752197, + "reward_std": 0.30377885699272156, + "rewards/fixed_code_pass_all_test_reward/mean": 0.633152186870575, + "rewards/fixed_code_pass_all_test_reward/std": 0.30377885699272156, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 283.5, + "completions/mean_terminated_length": 283.5, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.3967902600996126, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.08691768813878298, + "learning_rate": 1.9417794246554546e-05, + "loss": 0.0035, + "num_tokens": 17892903.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 449.125, + "completions/mean_terminated_length": 449.125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.39697472790997973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.04274927731603384, + "learning_rate": 1.941671114906592e-05, + "loss": 0.0017, + "num_tokens": 17901632.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 631.25, + "completions/mean_terminated_length": 631.25, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.3971591957203468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.05477748205885291, + "learning_rate": 1.9415627075315305e-05, + "loss": 0.0022, + "num_tokens": 17916698.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 131.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 115.875, + "completions/mean_terminated_length": 115.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.3973436635307139, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.21875, + "kl": 0.3450149307027459, + "learning_rate": 1.9414542025415088e-05, + "loss": 0.0138, + "num_tokens": 17920289.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 463.375, + "completions/mean_terminated_length": 463.375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.397528131341081, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05411734222434461, + "learning_rate": 1.9413455999477763e-05, + "loss": 0.0022, + "num_tokens": 17933716.0, + "reward": 1.59375, + "reward_std": 0.1735912710428238, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 325.5, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.3977125991514481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.039956387830898166, + "learning_rate": 1.941236899761592e-05, + "loss": 0.0016, + "num_tokens": 17943888.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 271.0, + "completions/mean_terminated_length": 271.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.39789706696181515, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.05863950354978442, + "learning_rate": 1.9411281019942255e-05, + "loss": 0.0023, + "num_tokens": 17949328.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 183.25, + "completions/mean_terminated_length": 183.25, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3980815347721823, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.06123341480270028, + "learning_rate": 1.941019206656956e-05, + "loss": 0.0024, + "num_tokens": 17953586.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.39826600258254935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1982421875, + "kl": 0.037422185007017106, + "learning_rate": 1.9409102137610727e-05, + "loss": 0.0015, + "num_tokens": 17958351.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.3984504703929164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.07626939087640494, + "learning_rate": 1.9408011233178756e-05, + "loss": 0.003, + "num_tokens": 17969920.0, + "reward": 1.2633495330810547, + "reward_std": 0.0034325113520026207, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2633495330810547, + "rewards/fixed_code_pass_all_test_reward/std": 0.0034325553569942713, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.39863493820328355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.11290344595909119, + "learning_rate": 1.9406919353386747e-05, + "loss": 0.0045, + "num_tokens": 17976100.0, + "reward": 1.783088207244873, + "reward_std": 0.32786908745765686, + "rewards/fixed_code_pass_all_test_reward/mean": 0.783088207244873, + "rewards/fixed_code_pass_all_test_reward/std": 0.32786908745765686, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 253.375, + "completions/mean_terminated_length": 253.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.3988194060136506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.03554335323860869, + "learning_rate": 1.94058264983479e-05, + "loss": 0.0014, + "num_tokens": 17983079.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 458.375, + "completions/mean_terminated_length": 458.375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.3990038738240177, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.059270055731758475, + "learning_rate": 1.9404732668175505e-05, + "loss": 0.0024, + "num_tokens": 17997618.0, + "reward": 1.4038461446762085, + "reward_std": 0.4307495951652527, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4038461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.4307496249675751, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 290.875, + "completions/mean_terminated_length": 290.875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.3991883416343848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.042807078221812844, + "learning_rate": 1.940363786298297e-05, + "loss": 0.0017, + "num_tokens": 18002897.0, + "reward": 1.375, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 222.75, + "completions/mean_terminated_length": 222.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3993728094447519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.03869446483440697, + "learning_rate": 1.94025420828838e-05, + "loss": 0.0015, + "num_tokens": 18007535.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 140.0, + "completions/mean_terminated_length": 140.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.39955727725511897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.07656186539679766, + "learning_rate": 1.940144532799159e-05, + "loss": 0.0031, + "num_tokens": 18011567.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3997417450654861, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.05544002237729728, + "learning_rate": 1.9400347598420056e-05, + "loss": 0.0022, + "num_tokens": 18019529.0, + "reward": 1.9047619104385376, + "reward_std": 0.2693740129470825, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9047619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.2693740129470825, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 158.375, + "completions/mean_terminated_length": 158.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.39992621287585317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.05693213013000786, + "learning_rate": 1.9399248894282993e-05, + "loss": 0.0023, + "num_tokens": 18023780.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 129.0, + "completions/mean_terminated_length": 129.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.40011068068622024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.271484375, + "kl": 0.10353199951350689, + "learning_rate": 1.9398149215694308e-05, + "loss": 0.0041, + "num_tokens": 18027636.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 253.25, + "completions/mean_terminated_length": 253.25, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.40029514849658737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.06699408125132322, + "learning_rate": 1.9397048562768015e-05, + "loss": 0.0027, + "num_tokens": 18036222.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 350.5, + "completions/mean_terminated_length": 350.5, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.40047961630695444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.03513389895670116, + "learning_rate": 1.939594693561822e-05, + "loss": 0.0014, + "num_tokens": 18046370.0, + "reward": 1.399999976158142, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4000000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 258.5, + "completions/mean_terminated_length": 258.5, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.4006640841173215, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.09294602856971323, + "learning_rate": 1.9394844334359124e-05, + "loss": 0.0037, + "num_tokens": 18054430.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 228.0, + "completions/mean_terminated_length": 228.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.40084855192768865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.06675314856693149, + "learning_rate": 1.939374075910505e-05, + "loss": 0.0027, + "num_tokens": 18062030.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 189.25, + "completions/mean_terminated_length": 189.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.4010330197380557, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.09125767834484577, + "learning_rate": 1.93926362099704e-05, + "loss": 0.0037, + "num_tokens": 18071312.0, + "reward": 1.191176414489746, + "reward_std": 0.4950321614742279, + "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, + "rewards/fixed_code_pass_all_test_reward/std": 0.1723969727754593, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.4012174875484228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.03939844435080886, + "learning_rate": 1.9391530687069692e-05, + "loss": 0.0016, + "num_tokens": 18079908.0, + "reward": 1.899999976158142, + "reward_std": 0.10690455138683319, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.10690449178218842, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 302.875, + "completions/mean_terminated_length": 302.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.40140195535878986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.057252559112384915, + "learning_rate": 1.9390424190517536e-05, + "loss": 0.0023, + "num_tokens": 18090547.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 115.875, + "completions/mean_terminated_length": 115.875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.401586423169157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.031221885699778795, + "learning_rate": 1.938931672042865e-05, + "loss": 0.0012, + "num_tokens": 18094194.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 291.625, + "completions/mean_terminated_length": 291.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.40177089097952406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.0883671403862536, + "learning_rate": 1.9388208276917842e-05, + "loss": 0.0035, + "num_tokens": 18103119.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 345.375, + "completions/mean_terminated_length": 345.375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.40195535878989114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.06271557370200753, + "learning_rate": 1.9387098860100037e-05, + "loss": 0.0025, + "num_tokens": 18112074.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.40213982660025827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.05180928623303771, + "learning_rate": 1.9385988470090242e-05, + "loss": 0.0021, + "num_tokens": 18121036.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 742.0, + "completions/mean_terminated_length": 742.0, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.40232429441062534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.0375310453819111, + "learning_rate": 1.9384877107003587e-05, + "loss": 0.0015, + "num_tokens": 18140380.0, + "reward": 1.3125, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 181.25, + "completions/mean_terminated_length": 181.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4025087622209924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37890625, + "kl": 0.15197537187486887, + "learning_rate": 1.938376477095528e-05, + "loss": 0.0061, + "num_tokens": 18144886.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 448.0, + "completions/mean_terminated_length": 448.0, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.40269323003135954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.049643858103081584, + "learning_rate": 1.9382651462060643e-05, + "loss": 0.002, + "num_tokens": 18156390.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 302.625, + "completions/mean_terminated_length": 302.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.4028776978417266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.10876232432201505, + "learning_rate": 1.9381537180435103e-05, + "loss": 0.0044, + "num_tokens": 18164603.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 296.875, + "completions/mean_terminated_length": 296.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4030621656520937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.04911126522347331, + "learning_rate": 1.9380421926194172e-05, + "loss": 0.002, + "num_tokens": 18173818.0, + "reward": 1.5401785373687744, + "reward_std": 0.6934295296669006, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6651785969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.40717756748199463, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4032466334624608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.036480508744716644, + "learning_rate": 1.9379305699453478e-05, + "loss": 0.0015, + "num_tokens": 18180454.0, + "reward": 1.951923131942749, + "reward_std": 0.13598209619522095, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9519230723381042, + "rewards/fixed_code_pass_all_test_reward/std": 0.13598206639289856, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 402.25, + "completions/mean_terminated_length": 402.25, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.4034311012728279, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.04836196266114712, + "learning_rate": 1.9378188500328746e-05, + "loss": 0.0019, + "num_tokens": 18188608.0, + "reward": 1.8888888359069824, + "reward_std": 0.11878276616334915, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.11878276616334915, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.40361556908319496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.061212467262521386, + "learning_rate": 1.937707032893579e-05, + "loss": 0.0024, + "num_tokens": 18197615.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 291.5, + "completions/mean_terminated_length": 291.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.4038000368935621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.048279145965352654, + "learning_rate": 1.9375951185390543e-05, + "loss": 0.0019, + "num_tokens": 18204115.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 239.625, + "completions/mean_terminated_length": 239.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.40398450470392916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.07347853295505047, + "learning_rate": 1.937483106980903e-05, + "loss": 0.0029, + "num_tokens": 18211536.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 263.0, + "completions/mean_terminated_length": 263.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.40416897251429623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05301134707406163, + "learning_rate": 1.9373709982307377e-05, + "loss": 0.0021, + "num_tokens": 18222008.0, + "reward": 1.9229323863983154, + "reward_std": 0.14270161092281342, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9229323267936707, + "rewards/fixed_code_pass_all_test_reward/std": 0.14270161092281342, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 316.25, + "completions/mean_terminated_length": 316.25, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.40435344032466336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.02576239500194788, + "learning_rate": 1.9372587923001807e-05, + "loss": 0.001, + "num_tokens": 18228930.0, + "reward": 1.3214285373687744, + "reward_std": 0.4644818902015686, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.03818017616868019, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.40453790813503043, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.06120583042502403, + "learning_rate": 1.937146489200865e-05, + "loss": 0.0024, + "num_tokens": 18238143.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 300.125, + "completions/mean_terminated_length": 300.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.4047223759453975, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.05549574992619455, + "learning_rate": 1.9370340889444333e-05, + "loss": 0.0022, + "num_tokens": 18248472.0, + "reward": 1.7828947305679321, + "reward_std": 0.40810689330101013, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7828947305679321, + "rewards/fixed_code_pass_all_test_reward/std": 0.4081069231033325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 174.125, + "completions/mean_terminated_length": 174.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.40490684375576463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.06633026199415326, + "learning_rate": 1.9369215915425388e-05, + "loss": 0.0027, + "num_tokens": 18252825.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 269.375, + "completions/mean_terminated_length": 269.375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.4050913115661317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.06826285785064101, + "learning_rate": 1.9368089970068442e-05, + "loss": 0.0027, + "num_tokens": 18261436.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 199.25, + "completions/mean_terminated_length": 199.25, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.4052757793764988, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.04750300641171634, + "learning_rate": 1.9366963053490227e-05, + "loss": 0.0019, + "num_tokens": 18265878.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 381.75, + "completions/mean_terminated_length": 381.75, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.4054602471868659, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.028336454997770488, + "learning_rate": 1.9365835165807576e-05, + "loss": 0.0011, + "num_tokens": 18272636.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 161.375, + "completions/mean_terminated_length": 161.375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.405644714997233, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.0360101864207536, + "learning_rate": 1.9364706307137417e-05, + "loss": 0.0014, + "num_tokens": 18276735.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 315.875, + "completions/mean_terminated_length": 315.875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.40582918280760005, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.04845994687639177, + "learning_rate": 1.9363576477596786e-05, + "loss": 0.0019, + "num_tokens": 18286206.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 302.0, + "completions/mean_terminated_length": 302.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.4060136506179672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.06846655998378992, + "learning_rate": 1.936244567730281e-05, + "loss": 0.0027, + "num_tokens": 18295550.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 299.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.40619811842833425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.047910166904330254, + "learning_rate": 1.9361313906372734e-05, + "loss": 0.0019, + "num_tokens": 18305828.0, + "reward": 1.9577702283859253, + "reward_std": 0.08025240153074265, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9577702879905701, + "rewards/fixed_code_pass_all_test_reward/std": 0.08025235682725906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 380.75, + "completions/mean_terminated_length": 380.75, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4063825862387013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.05310535826720297, + "learning_rate": 1.936018116492388e-05, + "loss": 0.0021, + "num_tokens": 18317386.0, + "reward": 1.478124976158142, + "reward_std": 0.25405198335647583, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4781250059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.25405198335647583, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 404.5, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.40656705404906845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.06808144506067038, + "learning_rate": 1.9359047453073696e-05, + "loss": 0.0027, + "num_tokens": 18329022.0, + "reward": 1.2989130020141602, + "reward_std": 0.4338110387325287, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29891303181648254, + "rewards/fixed_code_pass_all_test_reward/std": 0.4338110685348511, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 356.0, + "completions/mean_terminated_length": 356.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.4067515218594355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0661726831458509, + "learning_rate": 1.9357912770939708e-05, + "loss": 0.0026, + "num_tokens": 18338350.0, + "reward": 1.2314815521240234, + "reward_std": 0.5052148103713989, + "rewards/fixed_code_pass_all_test_reward/mean": 0.35648149251937866, + "rewards/fixed_code_pass_all_test_reward/std": 0.16849380731582642, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 379.0, + "completions/mean_terminated_length": 379.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4069359896698026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.05881792004220188, + "learning_rate": 1.9356777118639552e-05, + "loss": 0.0024, + "num_tokens": 18350054.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 478.125, + "completions/mean_terminated_length": 478.125, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.40712045748016973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.03625512705184519, + "learning_rate": 1.9355640496290967e-05, + "loss": 0.0015, + "num_tokens": 18359567.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 398.5, + "completions/mean_terminated_length": 398.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.4073049252905368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.058654078748077154, + "learning_rate": 1.9354502904011794e-05, + "loss": 0.0023, + "num_tokens": 18371315.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 441.875, + "completions/mean_terminated_length": 441.875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.4074893931009039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.02990041926386766, + "learning_rate": 1.9353364341919972e-05, + "loss": 0.0012, + "num_tokens": 18384418.0, + "reward": 1.9752066135406494, + "reward_std": 0.057935524731874466, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9752066135406494, + "rewards/fixed_code_pass_all_test_reward/std": 0.05793552100658417, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.407673860911271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.0631833306979388, + "learning_rate": 1.9352224810133532e-05, + "loss": 0.0025, + "num_tokens": 18394069.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 190.5, + "completions/mean_terminated_length": 190.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4078583287216381, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.06377551006153226, + "learning_rate": 1.935108430877062e-05, + "loss": 0.0026, + "num_tokens": 18398457.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.40804279653200515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.337890625, + "kl": 0.07804432092234492, + "learning_rate": 1.934994283794947e-05, + "loss": 0.0031, + "num_tokens": 18407391.0, + "reward": 1.8947367668151855, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8947368264198303, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 448.75, + "completions/mean_terminated_length": 448.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.4082272643423723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.041587830521166325, + "learning_rate": 1.9348800397788424e-05, + "loss": 0.0017, + "num_tokens": 18416853.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 358.75, + "completions/mean_terminated_length": 358.75, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.40841173215273935, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.03245314140804112, + "learning_rate": 1.9347656988405925e-05, + "loss": 0.0013, + "num_tokens": 18426539.0, + "reward": 1.9888060092926025, + "reward_std": 0.020727353170514107, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9888059496879578, + "rewards/fixed_code_pass_all_test_reward/std": 0.020727327093482018, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 398.5, + "completions/mean_terminated_length": 398.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4085961999631064, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.06698237918317318, + "learning_rate": 1.9346512609920515e-05, + "loss": 0.0027, + "num_tokens": 18434127.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 478.125, + "completions/mean_terminated_length": 478.125, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.40878066777347355, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.02205577283166349, + "learning_rate": 1.9345367262450827e-05, + "loss": 0.0009, + "num_tokens": 18447488.0, + "reward": 1.5340908765792847, + "reward_std": 0.44915637373924255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6590908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.21041364967823029, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 754.375, + "completions/mean_terminated_length": 323.16668701171875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.4089651355838406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.06076545245014131, + "learning_rate": 1.934422094611561e-05, + "loss": 0.0024, + "num_tokens": 18457299.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.4091496033942077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.06589684984646738, + "learning_rate": 1.9343073661033708e-05, + "loss": 0.0026, + "num_tokens": 18466626.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 364.75, + "completions/mean_terminated_length": 364.75, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.4093340712045748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.05481755780056119, + "learning_rate": 1.9341925407324064e-05, + "loss": 0.0022, + "num_tokens": 18473720.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 396.0, + "completions/mean_terminated_length": 396.0, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.4095185390149419, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.09364976780489087, + "learning_rate": 1.9340776185105712e-05, + "loss": 0.0037, + "num_tokens": 18484360.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 433.25, + "completions/mean_terminated_length": 433.25, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.40970300682530897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.06644648895598948, + "learning_rate": 1.9339625994497808e-05, + "loss": 0.0027, + "num_tokens": 18495594.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 166.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.4098874746356761, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.04905774537473917, + "learning_rate": 1.933847483561959e-05, + "loss": 0.002, + "num_tokens": 18499634.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 357.375, + "completions/mean_terminated_length": 357.375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.41007194244604317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.04332609195262194, + "learning_rate": 1.9337322708590398e-05, + "loss": 0.0017, + "num_tokens": 18509373.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.41025641025641024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.03939898288808763, + "learning_rate": 1.9336169613529686e-05, + "loss": 0.0016, + "num_tokens": 18513780.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 360.125, + "completions/mean_terminated_length": 360.125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.41044087806677737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.02126741223037243, + "learning_rate": 1.9335015550556994e-05, + "loss": 0.0009, + "num_tokens": 18520101.0, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.41062534587714444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.02886657346971333, + "learning_rate": 1.9333860519791968e-05, + "loss": 0.0012, + "num_tokens": 18525281.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 570.125, + "completions/mean_terminated_length": 570.125, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.4108098136875115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.053466693265363574, + "learning_rate": 1.9332704521354356e-05, + "loss": 0.0021, + "num_tokens": 18539554.0, + "reward": 1.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 491.375, + "completions/mean_terminated_length": 491.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.41099428149787864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.062037328723818064, + "learning_rate": 1.9331547555364e-05, + "loss": 0.0025, + "num_tokens": 18550101.0, + "reward": 1.65625, + "reward_std": 0.48065245151519775, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 385.0, + "completions/mean_terminated_length": 385.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.4111787493082457, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.07843072223477066, + "learning_rate": 1.9330389621940854e-05, + "loss": 0.0031, + "num_tokens": 18557197.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 995.75, + "completions/mean_terminated_length": 364.3999938964844, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.4113632171186128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.04001449604402296, + "learning_rate": 1.9329230721204956e-05, + "loss": 0.0016, + "num_tokens": 18569371.0, + "reward": 0.9204545617103577, + "reward_std": 0.8520237803459167, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.45259320735931396, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 209.75, + "completions/mean_terminated_length": 209.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.4115476849289799, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.04664748511277139, + "learning_rate": 1.9328070853276458e-05, + "loss": 0.0019, + "num_tokens": 18574225.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 217.625, + "completions/mean_terminated_length": 217.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.411732152739347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.039697568863630295, + "learning_rate": 1.932691001827561e-05, + "loss": 0.0016, + "num_tokens": 18579094.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 554.125, + "completions/mean_terminated_length": 554.125, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.41191662054971406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.042685813969001174, + "learning_rate": 1.932574821632275e-05, + "loss": 0.0017, + "num_tokens": 18594199.0, + "reward": 1.875, + "reward_std": 0.3162689208984375, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3162688910961151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.4121010883600812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.03532982035540044, + "learning_rate": 1.932458544753834e-05, + "loss": 0.0014, + "num_tokens": 18599080.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1160.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 384.875, + "completions/mean_terminated_length": 384.875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.41228555617044826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.1344591446686536, + "learning_rate": 1.9323421712042915e-05, + "loss": 0.0054, + "num_tokens": 18608015.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 249.5, + "completions/mean_terminated_length": 249.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.41247002398081534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.322265625, + "kl": 0.06811526301316917, + "learning_rate": 1.9322257009957132e-05, + "loss": 0.0027, + "num_tokens": 18613011.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 786.625, + "completions/mean_terminated_length": 606.4285888671875, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.41265449179118247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.369140625, + "kl": 0.046631124801933765, + "learning_rate": 1.932109134140173e-05, + "loss": 0.0019, + "num_tokens": 18625080.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 347.375, + "completions/mean_terminated_length": 347.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.41283895960154954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.07852643867954612, + "learning_rate": 1.9319924706497567e-05, + "loss": 0.0031, + "num_tokens": 18636387.0, + "reward": 1.357954502105713, + "reward_std": 0.14463543891906738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.35795456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.14463548362255096, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 389.75, + "completions/mean_terminated_length": 389.75, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4130234274119166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.04115585068939254, + "learning_rate": 1.9318757105365594e-05, + "loss": 0.0016, + "num_tokens": 18646649.0, + "reward": 1.324013113975525, + "reward_std": 0.009789749048650265, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3240131735801697, + "rewards/fixed_code_pass_all_test_reward/std": 0.009789785370230675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 625.25, + "completions/mean_terminated_length": 422.0000305175781, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.41320789522228374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.62890625, + "kl": 0.0555822413880378, + "learning_rate": 1.9317588538126852e-05, + "loss": 0.0022, + "num_tokens": 18657763.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 183.25, + "completions/mean_terminated_length": 183.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4133923630326508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.026169362594373524, + "learning_rate": 1.931641900490249e-05, + "loss": 0.001, + "num_tokens": 18662189.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 349.75, + "completions/mean_terminated_length": 349.75, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.4135768308430179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.053370186826214194, + "learning_rate": 1.9315248505813763e-05, + "loss": 0.0021, + "num_tokens": 18671747.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 306.375, + "completions/mean_terminated_length": 306.375, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.41376129865338496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.03429671248886734, + "learning_rate": 1.931407704098202e-05, + "loss": 0.0014, + "num_tokens": 18678518.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 374.125, + "completions/mean_terminated_length": 374.125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.4139457664637521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3046875, + "kl": 0.10347723960876465, + "learning_rate": 1.9312904610528708e-05, + "loss": 0.0041, + "num_tokens": 18687447.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 351.0, + "completions/mean_terminated_length": 351.0, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.41413023427411916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.02256230456987396, + "learning_rate": 1.931173121457538e-05, + "loss": 0.0009, + "num_tokens": 18694855.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 710.75, + "completions/mean_terminated_length": 519.7142944335938, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.41431470208448623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37109375, + "kl": 0.052067259210161865, + "learning_rate": 1.931055685324368e-05, + "loss": 0.0021, + "num_tokens": 18709589.0, + "reward": 1.0967742204666138, + "reward_std": 0.4617042541503906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.22177419066429138, + "rewards/fixed_code_pass_all_test_reward/std": 0.15750157833099365, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 554.875, + "completions/mean_terminated_length": 554.875, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.41449916989485336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.042478944407776, + "learning_rate": 1.9309381526655362e-05, + "loss": 0.0017, + "num_tokens": 18724412.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 424.625, + "completions/mean_terminated_length": 424.625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.41468363770522043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.032415792578831315, + "learning_rate": 1.930820523493228e-05, + "loss": 0.0013, + "num_tokens": 18734497.0, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 541.125, + "completions/mean_terminated_length": 541.125, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.4148681055155875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.051177745684981346, + "learning_rate": 1.9307027978196376e-05, + "loss": 0.002, + "num_tokens": 18743546.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 350.0, + "completions/mean_terminated_length": 350.0, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.41505257332595463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.03792436153162271, + "learning_rate": 1.9305849756569705e-05, + "loss": 0.0015, + "num_tokens": 18750290.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 516.375, + "completions/mean_terminated_length": 516.375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.4152370411363217, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.06106442678719759, + "learning_rate": 1.9304670570174414e-05, + "loss": 0.0024, + "num_tokens": 18763005.0, + "reward": 1.8333332538604736, + "reward_std": 0.25814807415008545, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.25814807415008545, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 268.875, + "completions/mean_terminated_length": 268.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4154215089466888, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20703125, + "kl": 0.06257821898907423, + "learning_rate": 1.9303490419132758e-05, + "loss": 0.0025, + "num_tokens": 18770564.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 297.5, + "completions/mean_terminated_length": 297.5, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.4156059767570559, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.060645506251603365, + "learning_rate": 1.9302309303567085e-05, + "loss": 0.0024, + "num_tokens": 18776712.0, + "reward": 1.1145832538604736, + "reward_std": 0.062001921236515045, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1145833432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.06200198456645012, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1207.0, + "completions/max_terminated_length": 1207.0, + "completions/mean_length": 581.0, + "completions/mean_terminated_length": 581.0, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.415790444567423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.04547513881698251, + "learning_rate": 1.9301127223599843e-05, + "loss": 0.0018, + "num_tokens": 18787792.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 301.5, + "completions/mean_terminated_length": 301.5, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.41597491237779005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.06013173097744584, + "learning_rate": 1.9299944179353587e-05, + "loss": 0.0024, + "num_tokens": 18797772.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 268.0, + "completions/mean_terminated_length": 268.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.4161593801881572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.027040946180932224, + "learning_rate": 1.9298760170950964e-05, + "loss": 0.0011, + "num_tokens": 18803700.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 320.625, + "completions/mean_terminated_length": 320.625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.41634384799852425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.04632193874567747, + "learning_rate": 1.929757519851472e-05, + "loss": 0.0019, + "num_tokens": 18811937.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 329.875, + "completions/mean_terminated_length": 329.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.4165283158088913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.05699801770970225, + "learning_rate": 1.9296389262167712e-05, + "loss": 0.0023, + "num_tokens": 18820512.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 375.75, + "completions/mean_terminated_length": 375.75, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.41671278361925845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.051666841842234135, + "learning_rate": 1.9295202362032886e-05, + "loss": 0.0021, + "num_tokens": 18827862.0, + "reward": 1.5, + "reward_std": 0.3149183392524719, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.3149183392524719, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 191.125, + "completions/mean_terminated_length": 191.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4168972514296255, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.051860887790098786, + "learning_rate": 1.9294014498233294e-05, + "loss": 0.0021, + "num_tokens": 18832271.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 120.5, + "completions/mean_terminated_length": 120.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.4170817192399926, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.203125, + "kl": 0.10116012068465352, + "learning_rate": 1.9292825670892088e-05, + "loss": 0.004, + "num_tokens": 18836211.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 368.0, + "completions/mean_terminated_length": 368.0, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.4172661870503597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.05369422060903162, + "learning_rate": 1.929163588013251e-05, + "loss": 0.0021, + "num_tokens": 18844851.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 425.0, + "completions/mean_terminated_length": 425.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.4174506548607268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.037908138474449515, + "learning_rate": 1.9290445126077917e-05, + "loss": 0.0015, + "num_tokens": 18869059.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 446.375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.4176351226710939, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.041092017432674766, + "learning_rate": 1.9289253408851758e-05, + "loss": 0.0016, + "num_tokens": 18877262.0, + "reward": 1.6578947305679321, + "reward_std": 0.10895779728889465, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6578947305679321, + "rewards/fixed_code_pass_all_test_reward/std": 0.10895773023366928, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.417819590481461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.07798714423552155, + "learning_rate": 1.9288060728577575e-05, + "loss": 0.0031, + "num_tokens": 18885402.0, + "reward": 1.7321429252624512, + "reward_std": 0.3128393590450287, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3128393292427063, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 341.625, + "completions/mean_terminated_length": 341.625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.4180040582918281, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.03691658528987318, + "learning_rate": 1.9286867085379027e-05, + "loss": 0.0015, + "num_tokens": 18895287.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 436.0, + "completions/mean_terminated_length": 205.71429443359375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.41818852610219515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.016175875207409263, + "learning_rate": 1.9285672479379856e-05, + "loss": 0.0006, + "num_tokens": 18902263.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 329.375, + "completions/mean_terminated_length": 329.375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.4183729939125623, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.953125, + "kl": 0.05496821540873498, + "learning_rate": 1.928447691070391e-05, + "loss": 0.0022, + "num_tokens": 18910514.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 392.0, + "completions/mean_terminated_length": 392.0, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.41855746172292935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.02638878033030778, + "learning_rate": 1.9283280379475145e-05, + "loss": 0.0011, + "num_tokens": 18918698.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.4187419295332964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.029873751453123987, + "learning_rate": 1.9282082885817607e-05, + "loss": 0.0012, + "num_tokens": 18922646.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 225.25, + "completions/mean_terminated_length": 225.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.41892639734366355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.057515296153724194, + "learning_rate": 1.9280884429855438e-05, + "loss": 0.0023, + "num_tokens": 18931840.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 200.5, + "completions/mean_terminated_length": 200.5, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4191108651540306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1826171875, + "kl": 0.15789741277694702, + "learning_rate": 1.9279685011712894e-05, + "loss": 0.0063, + "num_tokens": 18936404.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 354.875, + "completions/mean_terminated_length": 354.875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.4192953329643977, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.058952752500772476, + "learning_rate": 1.9278484631514316e-05, + "loss": 0.0024, + "num_tokens": 18945955.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 294.875, + "completions/mean_terminated_length": 294.875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.4194798007747648, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.03770574275404215, + "learning_rate": 1.9277283289384154e-05, + "loss": 0.0015, + "num_tokens": 18953802.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 313.625, + "completions/mean_terminated_length": 313.625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.4196642685851319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.08242155937477946, + "learning_rate": 1.927608098544696e-05, + "loss": 0.0033, + "num_tokens": 18961775.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 381.75, + "completions/mean_terminated_length": 381.75, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.41984873639549897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.034104855614714324, + "learning_rate": 1.9274877719827373e-05, + "loss": 0.0014, + "num_tokens": 18971525.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 438.5, + "completions/mean_terminated_length": 438.5, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.4200332042058661, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.04342841892503202, + "learning_rate": 1.927367349265014e-05, + "loss": 0.0017, + "num_tokens": 18979841.0, + "reward": 1.6136362552642822, + "reward_std": 0.0420827642083168, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6136363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 452.625, + "completions/mean_terminated_length": 452.625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.42021767201623317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.05553004937246442, + "learning_rate": 1.9272468304040116e-05, + "loss": 0.0022, + "num_tokens": 18991678.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 351.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.42040213982660024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.042978229466825724, + "learning_rate": 1.9271262154122238e-05, + "loss": 0.0017, + "num_tokens": 19001415.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1054.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 690.75, + "completions/mean_terminated_length": 690.75, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.42058660763696737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7578125, + "kl": 0.03954421426169574, + "learning_rate": 1.9270055043021556e-05, + "loss": 0.0016, + "num_tokens": 19015509.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 378.25, + "completions/mean_terminated_length": 378.25, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.42077107544733444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.061089978320524096, + "learning_rate": 1.926884697086321e-05, + "loss": 0.0024, + "num_tokens": 19024119.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.4209555432577015, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.03823255992028862, + "learning_rate": 1.9267637937772456e-05, + "loss": 0.0015, + "num_tokens": 19029932.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 298.625, + "completions/mean_terminated_length": 298.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.42114001106806864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.05146806943230331, + "learning_rate": 1.9266427943874627e-05, + "loss": 0.0021, + "num_tokens": 19039257.0, + "reward": 1.58695650100708, + "reward_std": 0.2902688980102539, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5869565010070801, + "rewards/fixed_code_pass_all_test_reward/std": 0.2902688682079315, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 243.75, + "completions/mean_terminated_length": 243.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.4213244788784357, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.055736158741638064, + "learning_rate": 1.9265216989295174e-05, + "loss": 0.0022, + "num_tokens": 19044959.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 410.0, + "completions/mean_terminated_length": 410.0, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.4215089466888028, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.05023083230480552, + "learning_rate": 1.9264005074159633e-05, + "loss": 0.002, + "num_tokens": 19053023.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 146.125, + "completions/mean_terminated_length": 146.125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.4216934144991699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28515625, + "kl": 0.05534060252830386, + "learning_rate": 1.9262792198593657e-05, + "loss": 0.0022, + "num_tokens": 19057032.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 265.125, + "completions/mean_terminated_length": 265.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.421877882309537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.04976737545803189, + "learning_rate": 1.9261578362722986e-05, + "loss": 0.002, + "num_tokens": 19065185.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 304.75, + "completions/mean_terminated_length": 304.75, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.42206235011990406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.68359375, + "kl": 0.02806751464959234, + "learning_rate": 1.926036356667346e-05, + "loss": 0.0011, + "num_tokens": 19073151.0, + "reward": 1.9358108043670654, + "reward_std": 0.18155446648597717, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9358108043670654, + "rewards/fixed_code_pass_all_test_reward/std": 0.18155445158481598, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 479.75, + "completions/mean_terminated_length": 479.75, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.4222468179302712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.03395665902644396, + "learning_rate": 1.925914781057102e-05, + "loss": 0.0014, + "num_tokens": 19082581.0, + "reward": 1.7083333730697632, + "reward_std": 0.14337210357189178, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.1433720886707306, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 115.125, + "completions/mean_terminated_length": 115.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.42243128574063826, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.07398072304204106, + "learning_rate": 1.925793109454171e-05, + "loss": 0.003, + "num_tokens": 19086358.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 282.75, + "completions/mean_terminated_length": 282.75, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.42261575355100534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.05742534273304045, + "learning_rate": 1.925671341871167e-05, + "loss": 0.0023, + "num_tokens": 19092772.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 232.125, + "completions/mean_terminated_length": 232.125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.42280022136137246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.044009767938405275, + "learning_rate": 1.925549478320714e-05, + "loss": 0.0018, + "num_tokens": 19099301.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 290.75, + "completions/mean_terminated_length": 290.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.42298468917173954, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.03462363255675882, + "learning_rate": 1.9254275188154462e-05, + "loss": 0.0014, + "num_tokens": 19107627.0, + "reward": 1.7237085103988647, + "reward_std": 0.28233999013900757, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7237085103988647, + "rewards/fixed_code_pass_all_test_reward/std": 0.2823399305343628, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 166.125, + "completions/mean_terminated_length": 166.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4231691569821066, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.06847275653854012, + "learning_rate": 1.9253054633680074e-05, + "loss": 0.0027, + "num_tokens": 19111860.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 441.0, + "completions/mean_terminated_length": 441.0, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.42335362479247374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.019391046196687967, + "learning_rate": 1.925183311991052e-05, + "loss": 0.0008, + "num_tokens": 19120388.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 299.5, + "completions/mean_terminated_length": 299.5, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.4235380926028408, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.01970634318422526, + "learning_rate": 1.9250610646972428e-05, + "loss": 0.0008, + "num_tokens": 19126856.0, + "reward": 1.9500000476837158, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 95.5, + "completions/mean_terminated_length": 95.5, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.4237225604132079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.1823858292773366, + "learning_rate": 1.9249387214992544e-05, + "loss": 0.0073, + "num_tokens": 19130308.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.423907028223575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.05038139899261296, + "learning_rate": 1.9248162824097703e-05, + "loss": 0.002, + "num_tokens": 19140788.0, + "reward": 1.0714285373687744, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 248.0, + "completions/mean_terminated_length": 248.0, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.4240914960339421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.0333156727720052, + "learning_rate": 1.924693747441484e-05, + "loss": 0.0013, + "num_tokens": 19146172.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 254.0, + "completions/mean_terminated_length": 254.0, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.42427596384430916, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.08257703110575676, + "learning_rate": 1.9245711166070995e-05, + "loss": 0.0033, + "num_tokens": 19152068.0, + "reward": 1.8636363744735718, + "reward_std": 0.2524963617324829, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8636363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.2524963915348053, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 868.75, + "completions/mean_terminated_length": 475.66668701171875, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "epoch": 0.4244604316546763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41796875, + "kl": 0.01572364807361737, + "learning_rate": 1.92444838991933e-05, + "loss": 0.0006, + "num_tokens": 19164858.0, + "reward": 1.2604167461395264, + "reward_std": 0.784428060054779, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5104166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.3307189345359802, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.42464489946504336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.07347368029877543, + "learning_rate": 1.9243255673908994e-05, + "loss": 0.0029, + "num_tokens": 19169812.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 465.75, + "completions/mean_terminated_length": 465.75, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.42482936727541043, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.03824932366842404, + "learning_rate": 1.9242026490345406e-05, + "loss": 0.0015, + "num_tokens": 19180546.0, + "reward": 1.1666665077209473, + "reward_std": 0.03636966645717621, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.036369647830724716, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 268.0, + "completions/mean_terminated_length": 268.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.42501383508577756, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.07914357958361506, + "learning_rate": 1.9240796348629972e-05, + "loss": 0.0032, + "num_tokens": 19186810.0, + "reward": 1.7999999523162842, + "reward_std": 0.38544961810112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 324.5, + "completions/mean_terminated_length": 324.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.42519830289614463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.03608485125005245, + "learning_rate": 1.9239565248890225e-05, + "loss": 0.0014, + "num_tokens": 19195534.0, + "reward": 1.524999976158142, + "reward_std": 0.22730305790901184, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5249999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.22730302810668945, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 111.75, + "completions/mean_terminated_length": 111.75, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.4253827707065117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2333984375, + "kl": 0.08649184228852391, + "learning_rate": 1.9238333191253797e-05, + "loss": 0.0035, + "num_tokens": 19199244.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 580.375, + "completions/mean_terminated_length": 580.375, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.42556723851687883, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.035986074479296803, + "learning_rate": 1.923710017584842e-05, + "loss": 0.0014, + "num_tokens": 19214975.0, + "reward": 1.0299999713897705, + "reward_std": 0.08485280722379684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.029999999329447746, + "rewards/fixed_code_pass_all_test_reward/std": 0.08485281467437744, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 278.625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4257517063272459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.05489029781892896, + "learning_rate": 1.9235866202801924e-05, + "loss": 0.0022, + "num_tokens": 19223380.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 362.0, + "completions/mean_terminated_length": 362.0, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.425936174137613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.08160588308237493, + "learning_rate": 1.9234631272242243e-05, + "loss": 0.0033, + "num_tokens": 19234676.0, + "reward": 1.8858695030212402, + "reward_std": 0.3228096067905426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, + "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 336.0, + "completions/mean_terminated_length": 336.0, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.42612064194798005, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04796403902582824, + "learning_rate": 1.92333953842974e-05, + "loss": 0.0019, + "num_tokens": 19241908.0, + "reward": 1.4464285373687744, + "reward_std": 0.3926251530647278, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.39262518286705017, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 454.875, + "completions/mean_terminated_length": 454.875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.4263051097583472, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.014643264235928655, + "learning_rate": 1.9232158539095526e-05, + "loss": 0.0006, + "num_tokens": 19253283.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 212.5, + "completions/mean_terminated_length": 212.5, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.42648957756871425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.07480897568166256, + "learning_rate": 1.923092073676485e-05, + "loss": 0.003, + "num_tokens": 19260239.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.4266740453790813, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.1875, + "kl": 0.06601953622885048, + "learning_rate": 1.92296819774337e-05, + "loss": 0.0026, + "num_tokens": 19264959.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.42685851318944845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.05821204977110028, + "learning_rate": 1.9228442261230503e-05, + "loss": 0.0023, + "num_tokens": 19273735.0, + "reward": 1.1966667175292969, + "reward_std": 0.04320497065782547, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1966666579246521, + "rewards/fixed_code_pass_all_test_reward/std": 0.04320494830608368, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 460.625, + "completions/mean_terminated_length": 460.625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.4270429809998155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.07222754089161754, + "learning_rate": 1.9227201588283776e-05, + "loss": 0.0029, + "num_tokens": 19282044.0, + "reward": 1.9302325248718262, + "reward_std": 0.17884741723537445, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9302325248718262, + "rewards/fixed_code_pass_all_test_reward/std": 0.17884741723537445, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 336.75, + "completions/mean_terminated_length": 336.75, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.4272274488101826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.04622276150621474, + "learning_rate": 1.9225959958722156e-05, + "loss": 0.0018, + "num_tokens": 19291714.0, + "reward": 1.8026316165924072, + "reward_std": 0.2893027663230896, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8026316165924072, + "rewards/fixed_code_pass_all_test_reward/std": 0.2893027663230896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 255.625, + "completions/mean_terminated_length": 255.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.4274119166205497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.04531836276873946, + "learning_rate": 1.922471737267436e-05, + "loss": 0.0018, + "num_tokens": 19298159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 449.375, + "completions/mean_terminated_length": 449.375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.4275963844309168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.035482348408550024, + "learning_rate": 1.922347383026921e-05, + "loss": 0.0014, + "num_tokens": 19307298.0, + "reward": 1.2395832538604736, + "reward_std": 0.029462814331054688, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2395833432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.029462780803442, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 412.125, + "completions/mean_terminated_length": 412.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.42778085224128387, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.07914287154562771, + "learning_rate": 1.922222933163563e-05, + "loss": 0.0032, + "num_tokens": 19315259.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 162.5, + "completions/mean_terminated_length": 162.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.427965320051651, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.060592830181121826, + "learning_rate": 1.9220983876902647e-05, + "loss": 0.0024, + "num_tokens": 19319463.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 468.125, + "completions/mean_terminated_length": 468.125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.42814978786201807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.03913731896318495, + "learning_rate": 1.921973746619937e-05, + "loss": 0.0016, + "num_tokens": 19327928.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 277.75, + "completions/mean_terminated_length": 277.75, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.42833425567238514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.09962846525013447, + "learning_rate": 1.921849009965503e-05, + "loss": 0.004, + "num_tokens": 19334262.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 594.5, + "completions/mean_terminated_length": 594.5, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.4285187234827523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.036094429437071085, + "learning_rate": 1.921724177739894e-05, + "loss": 0.0014, + "num_tokens": 19344754.0, + "reward": 1.8060344457626343, + "reward_std": 0.25402316451072693, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8060344457626343, + "rewards/fixed_code_pass_all_test_reward/std": 0.25402316451072693, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 451.875, + "completions/mean_terminated_length": 451.875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.42870319129311935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.0469690237659961, + "learning_rate": 1.9215992499560515e-05, + "loss": 0.0019, + "num_tokens": 19354537.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 413.125, + "completions/mean_terminated_length": 413.125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.4288876591034864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.043040668591856956, + "learning_rate": 1.9214742266269275e-05, + "loss": 0.0017, + "num_tokens": 19364002.0, + "reward": 1.359375, + "reward_std": 0.4470679461956024, + "rewards/fixed_code_pass_all_test_reward/mean": 0.359375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4470680058002472, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 346.75, + "completions/mean_terminated_length": 346.75, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.42907212691385355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.05405280482955277, + "learning_rate": 1.921349107765484e-05, + "loss": 0.0022, + "num_tokens": 19371440.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 343.375, + "completions/mean_terminated_length": 343.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.4292565947242206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.053432710003107786, + "learning_rate": 1.9212238933846915e-05, + "loss": 0.0021, + "num_tokens": 19382979.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 291.125, + "completions/mean_terminated_length": 291.125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4294410625345877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.051974230678752065, + "learning_rate": 1.9210985834975323e-05, + "loss": 0.0021, + "num_tokens": 19390820.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 478.125, + "completions/mean_terminated_length": 478.125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.4296255303449548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0361328125, + "kl": 0.022210419527255, + "learning_rate": 1.9209731781169974e-05, + "loss": 0.0009, + "num_tokens": 19400085.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 360.25, + "completions/mean_terminated_length": 360.25, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.4298099981553219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.06940677133388817, + "learning_rate": 1.920847677256088e-05, + "loss": 0.0028, + "num_tokens": 19408927.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 768.875, + "completions/mean_terminated_length": 586.1428833007812, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.42999446596568897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.50390625, + "kl": 0.02712810243247077, + "learning_rate": 1.9207220809278154e-05, + "loss": 0.0011, + "num_tokens": 19423158.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.4301789337760561, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.0771363670937717, + "learning_rate": 1.9205963891452e-05, + "loss": 0.0031, + "num_tokens": 19429026.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 645.5, + "completions/mean_terminated_length": 645.5, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.43036340158642317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.045490658143535256, + "learning_rate": 1.920470601921273e-05, + "loss": 0.0018, + "num_tokens": 19439534.0, + "reward": 1.3833333253860474, + "reward_std": 0.24364951252937317, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38333332538604736, + "rewards/fixed_code_pass_all_test_reward/std": 0.24364949762821198, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 385.375, + "completions/mean_terminated_length": 385.375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.43054786939679024, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.06689278548583388, + "learning_rate": 1.9203447192690754e-05, + "loss": 0.0027, + "num_tokens": 19447081.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 444.0, + "completions/mean_terminated_length": 444.0, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.43073233720715737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.0388527640607208, + "learning_rate": 1.9202187412016577e-05, + "loss": 0.0016, + "num_tokens": 19455641.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.43091680501752444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.048122330103069544, + "learning_rate": 1.9200926677320805e-05, + "loss": 0.0019, + "num_tokens": 19461361.0, + "reward": 1.9464285373687744, + "reward_std": 0.1062890887260437, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.10628911107778549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 519.125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.4311012728278915, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.04164264351129532, + "learning_rate": 1.919966498873414e-05, + "loss": 0.0017, + "num_tokens": 19470378.0, + "reward": 1.3068181276321411, + "reward_std": 0.19998523592948914, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3068181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.19998525083065033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 185.5, + "completions/mean_terminated_length": 185.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.43128574063825864, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.07707827165722847, + "learning_rate": 1.919840234638739e-05, + "loss": 0.0031, + "num_tokens": 19474862.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.4314702084486257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.04515744931995869, + "learning_rate": 1.9197138750411452e-05, + "loss": 0.0018, + "num_tokens": 19479594.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 716.125, + "completions/mean_terminated_length": 525.857177734375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.4316546762589928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7890625, + "kl": 0.057672600261867046, + "learning_rate": 1.9195874200937336e-05, + "loss": 0.0023, + "num_tokens": 19491403.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 352.375, + "completions/mean_terminated_length": 352.375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.4318391440693599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.04179658833891153, + "learning_rate": 1.919460869809613e-05, + "loss": 0.0017, + "num_tokens": 19500070.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 472.75, + "completions/mean_terminated_length": 472.75, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.432023611879727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.07956786081194878, + "learning_rate": 1.9193342242019044e-05, + "loss": 0.0032, + "num_tokens": 19508636.0, + "reward": 1.5416666269302368, + "reward_std": 0.22072911262512207, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.22072911262512207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.43220807969009406, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.06560546555556357, + "learning_rate": 1.9192074832837367e-05, + "loss": 0.0026, + "num_tokens": 19518873.0, + "reward": 1.7791666984558105, + "reward_std": 0.05892553552985191, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7791666984558105, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 303.5, + "completions/mean_terminated_length": 303.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4323925475004612, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06103515625, + "kl": 0.047562207211740315, + "learning_rate": 1.9190806470682503e-05, + "loss": 0.0019, + "num_tokens": 19526997.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 409.875, + "completions/mean_terminated_length": 409.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.43257701531082826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.03533302713185549, + "learning_rate": 1.9189537155685944e-05, + "loss": 0.0014, + "num_tokens": 19534244.0, + "reward": 1.5, + "reward_std": 0.7292091846466064, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.417855441570282, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 325.125, + "completions/mean_terminated_length": 325.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.43276148312119533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.0498668325599283, + "learning_rate": 1.9188266887979287e-05, + "loss": 0.002, + "num_tokens": 19540757.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 894.875, + "completions/mean_terminated_length": 510.5, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.43294595093156246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6484375, + "kl": 0.0201860044762725, + "learning_rate": 1.9186995667694216e-05, + "loss": 0.0008, + "num_tokens": 19554044.0, + "reward": 1.25, + "reward_std": 0.8864052295684814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 799.75, + "completions/mean_terminated_length": 799.75, + "completions/min_length": 695.0, + "completions/min_terminated_length": 695.0, + "epoch": 0.43313041874192953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109375, + "kl": 0.030497669242322445, + "learning_rate": 1.918572349496253e-05, + "loss": 0.0012, + "num_tokens": 19571522.0, + "reward": 1.2916667461395264, + "reward_std": 0.3063361644744873, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.3063361942768097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 478.875, + "completions/mean_terminated_length": 478.875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.4333148865522966, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.061279186280444264, + "learning_rate": 1.9184450369916123e-05, + "loss": 0.0025, + "num_tokens": 19580401.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 363.75, + "completions/mean_terminated_length": 363.75, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.43349935436266374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.03851740341633558, + "learning_rate": 1.9183176292686974e-05, + "loss": 0.0015, + "num_tokens": 19589751.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 462.625, + "completions/mean_terminated_length": 462.625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.4336838221730308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.06285226996988058, + "learning_rate": 1.9181901263407178e-05, + "loss": 0.0025, + "num_tokens": 19600252.0, + "reward": 1.6428570747375488, + "reward_std": 0.3972390294075012, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6428571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.3972390294075012, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 404.75, + "completions/mean_terminated_length": 404.75, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4338682899833979, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.054634854197502136, + "learning_rate": 1.918062528220892e-05, + "loss": 0.0022, + "num_tokens": 19624338.0, + "reward": 1.7981927394866943, + "reward_std": 0.37864968180656433, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7981927394866943, + "rewards/fixed_code_pass_all_test_reward/std": 0.37864968180656433, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 232.75, + "completions/mean_terminated_length": 232.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.434052757793765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.04426986863836646, + "learning_rate": 1.9179348349224483e-05, + "loss": 0.0018, + "num_tokens": 19629152.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 328.0, + "completions/mean_terminated_length": 328.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.4342372256041321, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.578125, + "kl": 0.042994947521947324, + "learning_rate": 1.9178070464586255e-05, + "loss": 0.0017, + "num_tokens": 19639416.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 379.125, + "completions/mean_terminated_length": 379.125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.43442169341449915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0721307871863246, + "learning_rate": 1.9176791628426718e-05, + "loss": 0.0029, + "num_tokens": 19648121.0, + "reward": 0.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 651.875, + "completions/mean_terminated_length": 452.4285888671875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.4346061612248663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.06704360741423443, + "learning_rate": 1.9175511840878446e-05, + "loss": 0.0027, + "num_tokens": 19660784.0, + "reward": 0.9375, + "reward_std": 0.3788071870803833, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.025253813713788986, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 321.25, + "completions/mean_terminated_length": 321.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.43479062903523336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.06823410396464169, + "learning_rate": 1.917423110207413e-05, + "loss": 0.0027, + "num_tokens": 19669138.0, + "reward": 1.0840909481048584, + "reward_std": 0.01928478479385376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08409091085195541, + "rewards/fixed_code_pass_all_test_reward/std": 0.01928473263978958, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.43497509684560043, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.07440023776143789, + "learning_rate": 1.9172949412146542e-05, + "loss": 0.003, + "num_tokens": 19675701.0, + "reward": 1.59375, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 310.25, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.43515956465596756, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.04279282200150192, + "learning_rate": 1.917166677122856e-05, + "loss": 0.0017, + "num_tokens": 19682511.0, + "reward": 1.625, + "reward_std": 0.2781743109226227, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2781743109226227, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 357.5, + "completions/mean_terminated_length": 357.5, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.43534403246633463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.08303224807605147, + "learning_rate": 1.9170383179453158e-05, + "loss": 0.0033, + "num_tokens": 19692043.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 353.25, + "completions/mean_terminated_length": 353.25, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.4355285002767017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.04342034482397139, + "learning_rate": 1.916909863695341e-05, + "loss": 0.0017, + "num_tokens": 19698117.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 408.5, + "completions/mean_terminated_length": 408.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.43571296808706883, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.03608640073798597, + "learning_rate": 1.9167813143862497e-05, + "loss": 0.0014, + "num_tokens": 19706425.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 219.625, + "completions/mean_terminated_length": 219.625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.4358974358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.07215271471068263, + "learning_rate": 1.9166526700313683e-05, + "loss": 0.0029, + "num_tokens": 19714470.0, + "reward": 1.0535714626312256, + "reward_std": 0.0739356130361557, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 118.0, + "completions/mean_terminated_length": 118.0, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.436081903707803, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.10744597436860204, + "learning_rate": 1.9165239306440336e-05, + "loss": 0.0043, + "num_tokens": 19718286.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 373.875, + "completions/mean_terminated_length": 373.875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.4362663715181701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.03611526032909751, + "learning_rate": 1.916395096237593e-05, + "loss": 0.0014, + "num_tokens": 19728413.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 397.375, + "completions/mean_terminated_length": 397.375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.4364508393285372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.045328846434131265, + "learning_rate": 1.916266166825403e-05, + "loss": 0.0018, + "num_tokens": 19737520.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 426.625, + "completions/mean_terminated_length": 426.625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.43663530713890425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.057888827519491315, + "learning_rate": 1.9161371424208296e-05, + "loss": 0.0023, + "num_tokens": 19748069.0, + "reward": 1.6938775777816772, + "reward_std": 0.6113696098327637, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8188775777816772, + "rewards/fixed_code_pass_all_test_reward/std": 0.3375398814678192, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 290.125, + "completions/mean_terminated_length": 290.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4368197749492714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.08168609626591206, + "learning_rate": 1.9160080230372502e-05, + "loss": 0.0033, + "num_tokens": 19755942.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 267.375, + "completions/mean_terminated_length": 267.375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.43700424275963845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.07749372208490968, + "learning_rate": 1.9158788086880502e-05, + "loss": 0.0031, + "num_tokens": 19761113.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 215.5, + "completions/mean_terminated_length": 215.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4371887105700055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.038680328289046884, + "learning_rate": 1.9157494993866262e-05, + "loss": 0.0015, + "num_tokens": 19768133.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 425.625, + "completions/mean_terminated_length": 425.625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.43737317838037265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.027796739479526877, + "learning_rate": 1.915620095146384e-05, + "loss": 0.0011, + "num_tokens": 19776906.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 381.5, + "completions/mean_terminated_length": 381.5, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.4375576461907397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.020519911544397473, + "learning_rate": 1.9154905959807394e-05, + "loss": 0.0008, + "num_tokens": 19784958.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.4377421140011068, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.09658517176285386, + "learning_rate": 1.9153610019031177e-05, + "loss": 0.0039, + "num_tokens": 19788862.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 303.75, + "completions/mean_terminated_length": 303.75, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.4379265818114739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.05363936512731016, + "learning_rate": 1.9152313129269545e-05, + "loss": 0.0021, + "num_tokens": 19798524.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.438111049621841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.1740614203736186, + "learning_rate": 1.9151015290656955e-05, + "loss": 0.007, + "num_tokens": 19804424.0, + "reward": 1.8857526779174805, + "reward_std": 0.3231402039527893, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8857526779174805, + "rewards/fixed_code_pass_all_test_reward/std": 0.3231402337551117, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 238.125, + "completions/mean_terminated_length": 238.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.43829551743220807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.041146688628941774, + "learning_rate": 1.914971650332795e-05, + "loss": 0.0016, + "num_tokens": 19809361.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.43847998524257514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.04686918528750539, + "learning_rate": 1.9148416767417188e-05, + "loss": 0.0019, + "num_tokens": 19814150.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.43866445305294227, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05641306610777974, + "learning_rate": 1.9147116083059413e-05, + "loss": 0.0023, + "num_tokens": 19820443.0, + "reward": 1.8016917705535889, + "reward_std": 0.29313498735427856, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8016917705535889, + "rewards/fixed_code_pass_all_test_reward/std": 0.2931349575519562, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 287.875, + "completions/mean_terminated_length": 287.875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.43884892086330934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.03580252768006176, + "learning_rate": 1.9145814450389472e-05, + "loss": 0.0014, + "num_tokens": 19829930.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4390333886736764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.055946134962141514, + "learning_rate": 1.9144511869542312e-05, + "loss": 0.0022, + "num_tokens": 19834183.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 470.25, + "completions/mean_terminated_length": 470.25, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.43921785648404355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03587186196818948, + "learning_rate": 1.914320834065297e-05, + "loss": 0.0014, + "num_tokens": 19843249.0, + "reward": 1.6120129823684692, + "reward_std": 0.07649830728769302, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6120129823684692, + "rewards/fixed_code_pass_all_test_reward/std": 0.07649827003479004, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 287.625, + "completions/mean_terminated_length": 287.625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.4394023242944106, + "frac_reward_zero_std": 0.0, + "grad_norm": 34.75, + "kl": 0.2773497684393078, + "learning_rate": 1.914190386385659e-05, + "loss": 0.0111, + "num_tokens": 19849790.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 230.25, + "completions/mean_terminated_length": 230.25, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.4395867921047777, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.032679900992661715, + "learning_rate": 1.9140598439288412e-05, + "loss": 0.0013, + "num_tokens": 19855544.0, + "reward": 1.440000057220459, + "reward_std": 0.3038797080516815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4399999976158142, + "rewards/fixed_code_pass_all_test_reward/std": 0.30387967824935913, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 453.375, + "completions/mean_terminated_length": 453.375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.4397712599151448, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.05411852872930467, + "learning_rate": 1.9139292067083776e-05, + "loss": 0.0022, + "num_tokens": 19867635.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.4399557277255119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20703125, + "kl": 0.05432027648203075, + "learning_rate": 1.9137984747378117e-05, + "loss": 0.0022, + "num_tokens": 19872507.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.44014019553587896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.07272415515035391, + "learning_rate": 1.9136676480306967e-05, + "loss": 0.0029, + "num_tokens": 19877521.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4403246633462461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.03546681790612638, + "learning_rate": 1.9135367266005957e-05, + "loss": 0.0014, + "num_tokens": 19887230.0, + "reward": 1.8181818723678589, + "reward_std": 0.13571275770664215, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.13571275770664215, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 221.5, + "completions/mean_terminated_length": 221.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.44050913115661317, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.08220332162454724, + "learning_rate": 1.913405710461082e-05, + "loss": 0.0033, + "num_tokens": 19897658.0, + "reward": 1.0138888359069824, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 302.875, + "completions/mean_terminated_length": 302.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.44069359896698024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.06648998102173209, + "learning_rate": 1.9132745996257388e-05, + "loss": 0.0027, + "num_tokens": 19908681.0, + "reward": 1.3914473056793213, + "reward_std": 0.15816861391067505, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39144736528396606, + "rewards/fixed_code_pass_all_test_reward/std": 0.15816862881183624, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.44087806677734737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.050412053940817714, + "learning_rate": 1.9131433941081585e-05, + "loss": 0.002, + "num_tokens": 19918490.0, + "reward": 1.9249999523162842, + "reward_std": 0.2121320217847824, + "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 702.875, + "completions/mean_terminated_length": 702.875, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "epoch": 0.44106253458771444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66015625, + "kl": 0.030703907483257353, + "learning_rate": 1.9130120939219436e-05, + "loss": 0.0012, + "num_tokens": 19935185.0, + "reward": 1.5, + "reward_std": 0.4655483365058899, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.4655483663082123, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.4412470023980815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.03788535506464541, + "learning_rate": 1.912880699080706e-05, + "loss": 0.0015, + "num_tokens": 19939370.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 202.75, + "completions/mean_terminated_length": 202.75, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.44143147020844864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.06428035930730402, + "learning_rate": 1.9127492095980685e-05, + "loss": 0.0026, + "num_tokens": 19946760.0, + "reward": 1.9406249523162842, + "reward_std": 0.16793784499168396, + "rewards/fixed_code_pass_all_test_reward/mean": 0.940625011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.16793787479400635, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 291.5, + "completions/mean_terminated_length": 291.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4416159380188157, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.04636247269809246, + "learning_rate": 1.9126176254876633e-05, + "loss": 0.0019, + "num_tokens": 19953196.0, + "reward": 1.0192307233810425, + "reward_std": 0.4121977686882019, + "rewards/fixed_code_pass_all_test_reward/mean": 0.14423076808452606, + "rewards/fixed_code_pass_all_test_reward/std": 0.06081303581595421, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 232.5, + "completions/mean_terminated_length": 232.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.4418004058291828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.029619957669638097, + "learning_rate": 1.912485946763131e-05, + "loss": 0.0012, + "num_tokens": 19957936.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 403.75, + "completions/mean_terminated_length": 403.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.4419848736395499, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.07484762789681554, + "learning_rate": 1.9123541734381244e-05, + "loss": 0.003, + "num_tokens": 19968470.0, + "reward": 1.3333333730697632, + "reward_std": 0.3563483655452728, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.3563483655452728, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 493.0, + "completions/mean_terminated_length": 493.0, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.442169341449917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.045036432798951864, + "learning_rate": 1.9122223055263043e-05, + "loss": 0.0018, + "num_tokens": 19978118.0, + "reward": 1.774999976158142, + "reward_std": 0.37701839208602905, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.37701839208602905, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 202.375, + "completions/mean_terminated_length": 202.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.44235380926028406, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.05718409572727978, + "learning_rate": 1.912090343041342e-05, + "loss": 0.0023, + "num_tokens": 19982497.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 336.75, + "completions/mean_terminated_length": 336.75, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.4425382770706512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.08714699000120163, + "learning_rate": 1.911958285996918e-05, + "loss": 0.0035, + "num_tokens": 19995127.0, + "reward": 1.899999976158142, + "reward_std": 0.21380899846553802, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.21380899846553802, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.44272274488101826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.053044983418658376, + "learning_rate": 1.9118261344067236e-05, + "loss": 0.0021, + "num_tokens": 20000874.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 364.375, + "completions/mean_terminated_length": 364.375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.44290721269138533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.04983460553921759, + "learning_rate": 1.9116938882844596e-05, + "loss": 0.002, + "num_tokens": 20014149.0, + "reward": 1.170258641242981, + "reward_std": 0.33526611328125, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17025862634181976, + "rewards/fixed_code_pass_all_test_reward/std": 0.3352661430835724, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 338.5, + "completions/mean_terminated_length": 338.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.44309168050175246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.07310786168090999, + "learning_rate": 1.9115615476438362e-05, + "loss": 0.0029, + "num_tokens": 20024089.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 122.375, + "completions/mean_terminated_length": 122.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.44327614831211953, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.173828125, + "kl": 0.040407335152849555, + "learning_rate": 1.9114291124985732e-05, + "loss": 0.0016, + "num_tokens": 20027804.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 202.875, + "completions/mean_terminated_length": 202.875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.4434606161224866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.04502365714870393, + "learning_rate": 1.911296582862401e-05, + "loss": 0.0018, + "num_tokens": 20035195.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 234.0, + "completions/mean_terminated_length": 234.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.44364508393285373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.041253834031522274, + "learning_rate": 1.9111639587490595e-05, + "loss": 0.0017, + "num_tokens": 20046387.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.4438295517432208, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6171875, + "kl": 0.10233459621667862, + "learning_rate": 1.911031240172298e-05, + "loss": 0.0041, + "num_tokens": 20055140.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 305.25, + "completions/mean_terminated_length": 305.25, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.4440140195535879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.244140625, + "kl": 0.07702045887708664, + "learning_rate": 1.9108984271458758e-05, + "loss": 0.0031, + "num_tokens": 20065054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.444198487363955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.03391783335246146, + "learning_rate": 1.9107655196835627e-05, + "loss": 0.0014, + "num_tokens": 20071474.0, + "reward": 1.84375, + "reward_std": 0.2893187701702118, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.4443829551743221, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.08611002354882658, + "learning_rate": 1.910632517799137e-05, + "loss": 0.0034, + "num_tokens": 20082280.0, + "reward": 1.7300000190734863, + "reward_std": 0.3726353943347931, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7300000190734863, + "rewards/fixed_code_pass_all_test_reward/std": 0.3726354241371155, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 237.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.44456742298468915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.06404017843306065, + "learning_rate": 1.9104994215063876e-05, + "loss": 0.0026, + "num_tokens": 20088191.0, + "reward": 1.6273585557937622, + "reward_std": 0.22579647600650787, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6273584961891174, + "rewards/fixed_code_pass_all_test_reward/std": 0.22579655051231384, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 349.625, + "completions/mean_terminated_length": 349.625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.4447518907950563, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.05312766553834081, + "learning_rate": 1.910366230819113e-05, + "loss": 0.0021, + "num_tokens": 20095316.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 332.75, + "completions/mean_terminated_length": 332.75, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.44493635860542335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.028320763492956758, + "learning_rate": 1.9102329457511217e-05, + "loss": 0.0011, + "num_tokens": 20102418.0, + "reward": 1.9090909957885742, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.4451208264157904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.05380662437528372, + "learning_rate": 1.9100995663162317e-05, + "loss": 0.0022, + "num_tokens": 20110430.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 289.75, + "completions/mean_terminated_length": 289.75, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.44530529422615756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.04163537628483027, + "learning_rate": 1.909966092528271e-05, + "loss": 0.0017, + "num_tokens": 20120828.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.44548976203652463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.04899488063529134, + "learning_rate": 1.9098325244010773e-05, + "loss": 0.002, + "num_tokens": 20125148.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 490.875, + "completions/mean_terminated_length": 490.875, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.4456742298468917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.05311124725267291, + "learning_rate": 1.9096988619484977e-05, + "loss": 0.0021, + "num_tokens": 20134259.0, + "reward": 1.3214285373687744, + "reward_std": 0.6296836137771606, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4464285671710968, + "rewards/fixed_code_pass_all_test_reward/std": 0.3794080317020416, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 528.875, + "completions/mean_terminated_length": 528.875, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.44585869765725883, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.023211789783090353, + "learning_rate": 1.90956510518439e-05, + "loss": 0.0009, + "num_tokens": 20142970.0, + "reward": 1.375, + "reward_std": 0.40089187026023865, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.40089187026023865, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 346.0, + "completions/mean_terminated_length": 346.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.4460431654676259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.06979157985188067, + "learning_rate": 1.9094312541226207e-05, + "loss": 0.0028, + "num_tokens": 20149770.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 133.125, + "completions/mean_terminated_length": 133.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.446227633277993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.06892778677865863, + "learning_rate": 1.909297308777067e-05, + "loss": 0.0028, + "num_tokens": 20153699.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 207.25, + "completions/mean_terminated_length": 207.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.4464121010883601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.03642709576524794, + "learning_rate": 1.909163269161615e-05, + "loss": 0.0015, + "num_tokens": 20158269.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.4465965688987272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48828125, + "kl": 0.10374970780685544, + "learning_rate": 1.9090291352901615e-05, + "loss": 0.0041, + "num_tokens": 20162591.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.44678103670909425, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.59375, + "kl": 0.06303893262520432, + "learning_rate": 1.9088949071766124e-05, + "loss": 0.0025, + "num_tokens": 20172038.0, + "reward": 1.8430233001708984, + "reward_std": 0.21664845943450928, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8430233001708984, + "rewards/fixed_code_pass_all_test_reward/std": 0.21664850413799286, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.4469655045194614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.04099803953431547, + "learning_rate": 1.9087605848348834e-05, + "loss": 0.0016, + "num_tokens": 20177364.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 431.125, + "completions/mean_terminated_length": 431.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.44714997232982845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.045270358212292194, + "learning_rate": 1.9086261682789004e-05, + "loss": 0.0018, + "num_tokens": 20187005.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 232.375, + "completions/mean_terminated_length": 232.375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.4473344401401955, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.04848519433289766, + "learning_rate": 1.9084916575225988e-05, + "loss": 0.0019, + "num_tokens": 20195752.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 300.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.44751890795056265, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.06755637563765049, + "learning_rate": 1.908357052579924e-05, + "loss": 0.0027, + "num_tokens": 20205452.0, + "reward": 1.9196429252624512, + "reward_std": 0.2272842973470688, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9196428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.22728432714939117, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 308.5, + "completions/mean_terminated_length": 308.5, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4477033757609297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04701856151223183, + "learning_rate": 1.90822235346483e-05, + "loss": 0.0019, + "num_tokens": 20214624.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 241.875, + "completions/mean_terminated_length": 241.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.4478878435712968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.04325226484797895, + "learning_rate": 1.9080875601912824e-05, + "loss": 0.0017, + "num_tokens": 20220095.0, + "reward": 1.859375, + "reward_std": 0.26252126693725586, + "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, + "rewards/fixed_code_pass_all_test_reward/std": 0.26252126693725586, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 327.875, + "completions/mean_terminated_length": 327.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4480723113816639, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.02845128159970045, + "learning_rate": 1.9079526727732558e-05, + "loss": 0.0011, + "num_tokens": 20229342.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 359.75, + "completions/mean_terminated_length": 359.75, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.448256779192031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.042596359038725495, + "learning_rate": 1.9078176912247336e-05, + "loss": 0.0017, + "num_tokens": 20239964.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 140.125, + "completions/mean_terminated_length": 140.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.44844124700239807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.049838698003441095, + "learning_rate": 1.9076826155597108e-05, + "loss": 0.002, + "num_tokens": 20243925.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 425.5, + "completions/mean_terminated_length": 425.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.4486257148127652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037109375, + "kl": 0.015027020941488445, + "learning_rate": 1.90754744579219e-05, + "loss": 0.0006, + "num_tokens": 20252825.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 180.375, + "completions/mean_terminated_length": 180.375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.44881018262313227, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.06528020044788718, + "learning_rate": 1.9074121819361856e-05, + "loss": 0.0026, + "num_tokens": 20257092.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.44899465043349934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0668763059657067, + "learning_rate": 1.907276824005721e-05, + "loss": 0.0027, + "num_tokens": 20263441.0, + "reward": 1.1141304969787598, + "reward_std": 0.3228096067905426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11413043737411499, + "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 301.5, + "completions/mean_terminated_length": 301.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.44917911824386647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.05478463624604046, + "learning_rate": 1.9071413720148283e-05, + "loss": 0.0022, + "num_tokens": 20273541.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 444.0, + "completions/mean_terminated_length": 444.0, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.44936358605423354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.03800552221946418, + "learning_rate": 1.907005825977551e-05, + "loss": 0.0015, + "num_tokens": 20282245.0, + "reward": 1.8928570747375488, + "reward_std": 0.30304577946662903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 398.0, + "completions/mean_terminated_length": 398.0, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.4495480538646006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.028570421738550067, + "learning_rate": 1.9068701859079408e-05, + "loss": 0.0011, + "num_tokens": 20290149.0, + "reward": 1.9854650497436523, + "reward_std": 0.012036032974720001, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9854651093482971, + "rewards/fixed_code_pass_all_test_reward/std": 0.012036033906042576, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 397.0, + "completions/mean_terminated_length": 397.0, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.44973252167496774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.041548533365130424, + "learning_rate": 1.906734451820061e-05, + "loss": 0.0017, + "num_tokens": 20297957.0, + "reward": 1.1029412746429443, + "reward_std": 0.0415944829583168, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10294117778539658, + "rewards/fixed_code_pass_all_test_reward/std": 0.04159452021121979, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 266.375, + "completions/mean_terminated_length": 266.375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.4499169894853348, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.0731375110335648, + "learning_rate": 1.906598623727983e-05, + "loss": 0.0029, + "num_tokens": 20309536.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 703.0, + "completions/mean_terminated_length": 510.857177734375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.4501014572957019, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.03679737285710871, + "learning_rate": 1.9064627016457885e-05, + "loss": 0.0015, + "num_tokens": 20323072.0, + "reward": 1.6628788709640503, + "reward_std": 0.6728243231773376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7878787517547607, + "rewards/fixed_code_pass_all_test_reward/std": 0.320287823677063, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.450285925106069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.058982398360967636, + "learning_rate": 1.9063266855875695e-05, + "loss": 0.0024, + "num_tokens": 20327993.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4504703929164361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.06672487175092101, + "learning_rate": 1.906190575567427e-05, + "loss": 0.0027, + "num_tokens": 20332233.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 247.25, + "completions/mean_terminated_length": 247.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.45065486072680316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0702868674416095, + "learning_rate": 1.9060543715994713e-05, + "loss": 0.0028, + "num_tokens": 20341659.0, + "reward": 1.8990384340286255, + "reward_std": 0.28556233644485474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8990384340286255, + "rewards/fixed_code_pass_all_test_reward/std": 0.2855623662471771, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 238.375, + "completions/mean_terminated_length": 238.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.45083932853717024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.03574440581724048, + "learning_rate": 1.9059180736978242e-05, + "loss": 0.0014, + "num_tokens": 20346766.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 349.875, + "completions/mean_terminated_length": 349.875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.45102379634753736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.027888314565643668, + "learning_rate": 1.9057816818766156e-05, + "loss": 0.0011, + "num_tokens": 20355869.0, + "reward": 1.875, + "reward_std": 0.1649916023015976, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1649915874004364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 115.625, + "completions/mean_terminated_length": 115.625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.45120826415790444, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.07331628212705255, + "learning_rate": 1.905645196149986e-05, + "loss": 0.0029, + "num_tokens": 20359642.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 254.75, + "completions/mean_terminated_length": 254.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.4513927319682715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.08381405915133655, + "learning_rate": 1.905508616532085e-05, + "loss": 0.0034, + "num_tokens": 20369992.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 96.875, + "completions/mean_terminated_length": 96.875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.45157719977863864, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.05653827963396907, + "learning_rate": 1.9053719430370717e-05, + "loss": 0.0023, + "num_tokens": 20373463.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 266.125, + "completions/mean_terminated_length": 266.125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.4517616675890057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.0568434433080256, + "learning_rate": 1.905235175679117e-05, + "loss": 0.0023, + "num_tokens": 20380992.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 352.625, + "completions/mean_terminated_length": 352.625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.4519461353993728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.03464179555885494, + "learning_rate": 1.905098314472399e-05, + "loss": 0.0014, + "num_tokens": 20389621.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.4521306032097399, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26953125, + "kl": 0.06937766401097178, + "learning_rate": 1.9049613594311066e-05, + "loss": 0.0028, + "num_tokens": 20394641.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 318.625, + "completions/mean_terminated_length": 318.625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.452315071020107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.02606621861923486, + "learning_rate": 1.9048243105694383e-05, + "loss": 0.001, + "num_tokens": 20401158.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 600.75, + "completions/mean_terminated_length": 600.75, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.45249953883047406, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.030177247244864702, + "learning_rate": 1.9046871679016033e-05, + "loss": 0.0012, + "num_tokens": 20414612.0, + "reward": 1.0208332538604736, + "reward_std": 0.012858637608587742, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02083333395421505, + "rewards/fixed_code_pass_all_test_reward/std": 0.012858612462878227, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 469.25, + "completions/mean_terminated_length": 469.25, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.4526840066408412, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.04229787411168218, + "learning_rate": 1.9045499314418186e-05, + "loss": 0.0017, + "num_tokens": 20423582.0, + "reward": 1.567307710647583, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 349.5, + "completions/mean_terminated_length": 349.5, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.45286847445120826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.05416658194735646, + "learning_rate": 1.9044126012043125e-05, + "loss": 0.0022, + "num_tokens": 20432210.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 244.875, + "completions/mean_terminated_length": 244.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.45305294226157533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.056523178005591035, + "learning_rate": 1.904275177203322e-05, + "loss": 0.0023, + "num_tokens": 20442121.0, + "reward": 1.476190447807312, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4761904776096344, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 428.875, + "completions/mean_terminated_length": 428.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.45323741007194246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.189453125, + "kl": 0.0615214582066983, + "learning_rate": 1.9041376594530953e-05, + "loss": 0.0025, + "num_tokens": 20454408.0, + "reward": 1.2142857313156128, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2142857164144516, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 443.625, + "completions/mean_terminated_length": 443.625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.45342187788230953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04897480271756649, + "learning_rate": 1.9040000479678885e-05, + "loss": 0.002, + "num_tokens": 20465821.0, + "reward": 1.8977272510528564, + "reward_std": 0.11331124603748322, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8977273106575012, + "rewards/fixed_code_pass_all_test_reward/std": 0.11331123113632202, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4536063456926766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.042797015281394124, + "learning_rate": 1.903862342761968e-05, + "loss": 0.0017, + "num_tokens": 20470677.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 125.125, + "completions/mean_terminated_length": 125.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.45379081350304373, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.703125, + "kl": 0.07229993329383433, + "learning_rate": 1.9037245438496107e-05, + "loss": 0.0029, + "num_tokens": 20474542.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.4539752813134108, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.06182421091943979, + "learning_rate": 1.9035866512451032e-05, + "loss": 0.0025, + "num_tokens": 20483283.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 768.25, + "completions/mean_terminated_length": 768.25, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "epoch": 0.4541597491237779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.02068014396354556, + "learning_rate": 1.90344866496274e-05, + "loss": 0.0008, + "num_tokens": 20496013.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 566.0, + "completions/mean_terminated_length": 354.2857360839844, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.454344216934145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4375, + "kl": 0.055398496333509684, + "learning_rate": 1.9033105850168273e-05, + "loss": 0.0022, + "num_tokens": 20507501.0, + "reward": 1.6936274766921997, + "reward_std": 0.7022891044616699, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8186274766921997, + "rewards/fixed_code_pass_all_test_reward/std": 0.3664921224117279, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 482.875, + "completions/mean_terminated_length": 482.875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.4545286847445121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.027201927616260946, + "learning_rate": 1.9031724114216804e-05, + "loss": 0.0011, + "num_tokens": 20515948.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.45471315255487915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.05558465304784477, + "learning_rate": 1.903034144191624e-05, + "loss": 0.0022, + "num_tokens": 20523169.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 299.625, + "completions/mean_terminated_length": 299.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.4548976203652463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.09402292827144265, + "learning_rate": 1.9028957833409933e-05, + "loss": 0.0038, + "num_tokens": 20532198.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 494.875, + "completions/mean_terminated_length": 494.875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.45508208817561335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.052129237446933985, + "learning_rate": 1.9027573288841315e-05, + "loss": 0.0021, + "num_tokens": 20540973.0, + "reward": 1.4826388359069824, + "reward_std": 0.6010971665382385, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6076388359069824, + "rewards/fixed_code_pass_all_test_reward/std": 0.25041303038597107, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 399.625, + "completions/mean_terminated_length": 399.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.4552665559859804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.020885087666101754, + "learning_rate": 1.9026187808353935e-05, + "loss": 0.0008, + "num_tokens": 20548170.0, + "reward": 1.6607142686843872, + "reward_std": 0.716069221496582, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.4040610194206238, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 407.25, + "completions/mean_terminated_length": 407.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.45545102379634755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.06685170345008373, + "learning_rate": 1.9024801392091427e-05, + "loss": 0.0027, + "num_tokens": 20557380.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 336.125, + "completions/mean_terminated_length": 336.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.4556354916067146, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.05506485025398433, + "learning_rate": 1.902341404019753e-05, + "loss": 0.0022, + "num_tokens": 20563405.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 221.125, + "completions/mean_terminated_length": 221.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.4558199594170817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.07170854299329221, + "learning_rate": 1.902202575281607e-05, + "loss": 0.0029, + "num_tokens": 20568390.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.4560044272274488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.040886704344302416, + "learning_rate": 1.9020636530090976e-05, + "loss": 0.0016, + "num_tokens": 20577903.0, + "reward": 1.8305084705352783, + "reward_std": 0.3495918810367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8305084705352783, + "rewards/fixed_code_pass_all_test_reward/std": 0.3495918810367584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 236.75, + "completions/mean_terminated_length": 236.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4561888950378159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12255859375, + "kl": 0.06330850347876549, + "learning_rate": 1.901924637216628e-05, + "loss": 0.0025, + "num_tokens": 20583045.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.456373362848183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.019923260435461998, + "learning_rate": 1.901785527918609e-05, + "loss": 0.0008, + "num_tokens": 20588459.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 306.0, + "completions/mean_terminated_length": 306.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4565578306585501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.046889389865100384, + "learning_rate": 1.9016463251294644e-05, + "loss": 0.0019, + "num_tokens": 20595523.0, + "reward": 1.8518519401550293, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8518518805503845, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 327.625, + "completions/mean_terminated_length": 327.625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.4567422984689172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.031182186910882592, + "learning_rate": 1.9015070288636243e-05, + "loss": 0.0012, + "num_tokens": 20601832.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 353.375, + "completions/mean_terminated_length": 353.375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.45692676627928425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83984375, + "kl": 0.060395329259335995, + "learning_rate": 1.901367639135531e-05, + "loss": 0.0024, + "num_tokens": 20609243.0, + "reward": 1.735795497894287, + "reward_std": 0.10445894300937653, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7357954382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.10445895045995712, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 409.75, + "completions/mean_terminated_length": 409.75, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.4571112340896514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7578125, + "kl": 0.05128108547069132, + "learning_rate": 1.9012281559596344e-05, + "loss": 0.0021, + "num_tokens": 20617425.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.45729570190001845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.04223368666134775, + "learning_rate": 1.9010885793503965e-05, + "loss": 0.0017, + "num_tokens": 20621714.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 419.125, + "completions/mean_terminated_length": 419.125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.4574801697103855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71875, + "kl": 0.03437046578619629, + "learning_rate": 1.9009489093222865e-05, + "loss": 0.0014, + "num_tokens": 20630259.0, + "reward": 1.567307710647583, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 290.5, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.45766463752075265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.0721117933280766, + "learning_rate": 1.9008091458897854e-05, + "loss": 0.0029, + "num_tokens": 20640407.0, + "reward": 1.3199999332427979, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3199999928474426, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 252.625, + "completions/mean_terminated_length": 252.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.4578491053311197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.05214321683160961, + "learning_rate": 1.9006692890673823e-05, + "loss": 0.0021, + "num_tokens": 20648188.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 310.375, + "completions/mean_terminated_length": 310.375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.4580335731414868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.03493988059926778, + "learning_rate": 1.9005293388695772e-05, + "loss": 0.0014, + "num_tokens": 20657671.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.4582180409518539, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.044305355520918965, + "learning_rate": 1.9003892953108788e-05, + "loss": 0.0018, + "num_tokens": 20662021.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 373.375, + "completions/mean_terminated_length": 373.375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.458402508762221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.05712498165667057, + "learning_rate": 1.9002491584058055e-05, + "loss": 0.0023, + "num_tokens": 20669832.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 415.625, + "completions/mean_terminated_length": 415.625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.45858697657258807, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.029951917356811464, + "learning_rate": 1.9001089281688867e-05, + "loss": 0.0012, + "num_tokens": 20676653.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 245.0, + "completions/mean_terminated_length": 245.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.4587714443829552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.06121822237037122, + "learning_rate": 1.8999686046146598e-05, + "loss": 0.0024, + "num_tokens": 20686389.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.45895591219332227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.04804081073962152, + "learning_rate": 1.899828187757673e-05, + "loss": 0.0019, + "num_tokens": 20693246.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 643.75, + "completions/mean_terminated_length": 643.75, + "completions/min_length": 563.0, + "completions/min_terminated_length": 563.0, + "epoch": 0.45914038000368934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58203125, + "kl": 0.03889798605814576, + "learning_rate": 1.8996876776124836e-05, + "loss": 0.0016, + "num_tokens": 20711900.0, + "reward": 1.8020833730697632, + "reward_std": 0.3240906596183777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.45932484781405647, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5625, + "kl": 0.09396759839728475, + "learning_rate": 1.8995470741936588e-05, + "loss": 0.0038, + "num_tokens": 20719386.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 305.25, + "completions/mean_terminated_length": 305.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.45950931562442354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05584133695811033, + "learning_rate": 1.8994063775157757e-05, + "loss": 0.0022, + "num_tokens": 20727564.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 338.0, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4596937834347906, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.03982403548434377, + "learning_rate": 1.89926558759342e-05, + "loss": 0.0016, + "num_tokens": 20737028.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 324.0, + "completions/mean_terminated_length": 324.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.45987825124515774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.056834566057659686, + "learning_rate": 1.8991247044411886e-05, + "loss": 0.0023, + "num_tokens": 20747052.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 254.875, + "completions/mean_terminated_length": 254.875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.4600627190555248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.05896364198997617, + "learning_rate": 1.898983728073687e-05, + "loss": 0.0024, + "num_tokens": 20753507.0, + "reward": 1.671875, + "reward_std": 0.3716367185115814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3716367185115814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 414.125, + "completions/mean_terminated_length": 414.125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.4602471868658919, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.07373520731925964, + "learning_rate": 1.8988426585055313e-05, + "loss": 0.0029, + "num_tokens": 20763556.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 363.875, + "completions/mean_terminated_length": 363.875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.460431654676259, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.06801700964570045, + "learning_rate": 1.8987014957513458e-05, + "loss": 0.0027, + "num_tokens": 20777987.0, + "reward": 1.9302325248718262, + "reward_std": 0.04972304776310921, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9302325248718262, + "rewards/fixed_code_pass_all_test_reward/std": 0.04972302168607712, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 257.5, + "completions/mean_terminated_length": 257.5, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4606161224866261, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.027140547521412373, + "learning_rate": 1.898560239825766e-05, + "loss": 0.0011, + "num_tokens": 20784023.0, + "reward": 1.7586207389831543, + "reward_std": 0.09753197431564331, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7586206793785095, + "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 438.25, + "completions/mean_terminated_length": 438.25, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.46080059029699316, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.04661717941053212, + "learning_rate": 1.8984188907434356e-05, + "loss": 0.0019, + "num_tokens": 20797905.0, + "reward": 1.5348360538482666, + "reward_std": 0.4236942529678345, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5348360538482666, + "rewards/fixed_code_pass_all_test_reward/std": 0.42369428277015686, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 296.5, + "completions/mean_terminated_length": 296.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.4609850581073603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.10119939292781055, + "learning_rate": 1.8982774485190094e-05, + "loss": 0.004, + "num_tokens": 20806525.0, + "reward": 1.943750023841858, + "reward_std": 0.1590990275144577, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9437500238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.1590990275144577, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.46116952591772736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.03617412189487368, + "learning_rate": 1.8981359131671514e-05, + "loss": 0.0014, + "num_tokens": 20811478.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 345.5, + "completions/mean_terminated_length": 345.5, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.46135399372809444, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.04275578772649169, + "learning_rate": 1.897994284702534e-05, + "loss": 0.0017, + "num_tokens": 20821498.0, + "reward": 1.3829786777496338, + "reward_std": 0.8596827983856201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6329787373542786, + "rewards/fixed_code_pass_all_test_reward/std": 0.4038151800632477, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 256.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.46153846153846156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04971025721170008, + "learning_rate": 1.897852563139841e-05, + "loss": 0.002, + "num_tokens": 20830990.0, + "reward": 1.859375, + "reward_std": 0.2603869140148163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2603869140148163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.46172292934882864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.05455705150961876, + "learning_rate": 1.8977107484937652e-05, + "loss": 0.0022, + "num_tokens": 20838673.0, + "reward": 1.8352272510528564, + "reward_std": 0.3562045097351074, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9602272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.016070615500211716, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 407.75, + "completions/mean_terminated_length": 407.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.4619073971591957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.028918299009092152, + "learning_rate": 1.8975688407790093e-05, + "loss": 0.0012, + "num_tokens": 20846263.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.46209186496956284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03379430738277733, + "learning_rate": 1.8974268400102845e-05, + "loss": 0.0014, + "num_tokens": 20855223.0, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 317.625, + "completions/mean_terminated_length": 317.625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4622763327799299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.049331111600622535, + "learning_rate": 1.897284746202313e-05, + "loss": 0.002, + "num_tokens": 20863324.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 555.125, + "completions/mean_terminated_length": 555.125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "epoch": 0.462460800590297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.484375, + "kl": 0.01851571281440556, + "learning_rate": 1.897142559369826e-05, + "loss": 0.0007, + "num_tokens": 20883053.0, + "reward": 1.9153225421905518, + "reward_std": 0.23950394988059998, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9153225421905518, + "rewards/fixed_code_pass_all_test_reward/std": 0.2395039200782776, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 581.5, + "completions/mean_terminated_length": 581.5, + "completions/min_length": 466.0, + "completions/min_terminated_length": 466.0, + "epoch": 0.4626452684006641, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.48828125, + "kl": 0.018425900605507195, + "learning_rate": 1.8970002795275645e-05, + "loss": 0.0007, + "num_tokens": 20894641.0, + "reward": 1.2291666269302368, + "reward_std": 0.3204349875450134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2291666716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.32043495774269104, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4628297362110312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.462890625, + "kl": 0.07411512825638056, + "learning_rate": 1.896857906690279e-05, + "loss": 0.003, + "num_tokens": 20904699.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 213.375, + "completions/mean_terminated_length": 213.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.46301420402139826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.05978736048564315, + "learning_rate": 1.8967154408727303e-05, + "loss": 0.0024, + "num_tokens": 20913742.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 475.25, + "completions/mean_terminated_length": 475.25, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.46319867183176533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.0681481622159481, + "learning_rate": 1.8965728820896882e-05, + "loss": 0.0027, + "num_tokens": 20927560.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.46338313964213246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.056887043407186866, + "learning_rate": 1.8964302303559315e-05, + "loss": 0.0023, + "num_tokens": 20934891.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 424.25, + "completions/mean_terminated_length": 424.25, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.46356760745249953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.05755751917604357, + "learning_rate": 1.89628748568625e-05, + "loss": 0.0023, + "num_tokens": 20946717.0, + "reward": 1.899999976158142, + "reward_std": 0.2828426957130432, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 231.5, + "completions/mean_terminated_length": 231.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.4637520752628666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.042790445033460855, + "learning_rate": 1.8961446480954422e-05, + "loss": 0.0017, + "num_tokens": 20951377.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 643.125, + "completions/mean_terminated_length": 643.125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.46393654307323373, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.68359375, + "kl": 0.02398298680782318, + "learning_rate": 1.896001717598317e-05, + "loss": 0.001, + "num_tokens": 20971778.0, + "reward": 1.5645160675048828, + "reward_std": 0.46555182337760925, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5645161271095276, + "rewards/fixed_code_pass_all_test_reward/std": 0.46555185317993164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 387.25, + "completions/mean_terminated_length": 387.25, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.4641210108836008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.02868594799656421, + "learning_rate": 1.8958586942096922e-05, + "loss": 0.0011, + "num_tokens": 20978836.0, + "reward": 1.7678570747375488, + "reward_std": 0.3657134771347046, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.19839002192020416, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 319.875, + "completions/mean_terminated_length": 319.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4643054786939679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.050590952625498176, + "learning_rate": 1.8957155779443956e-05, + "loss": 0.002, + "num_tokens": 20985579.0, + "reward": 1.7058823108673096, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 127.5, + "completions/mean_terminated_length": 127.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.464489946504335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.0547384072560817, + "learning_rate": 1.8955723688172645e-05, + "loss": 0.0022, + "num_tokens": 20989423.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 387.25, + "completions/mean_terminated_length": 387.25, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.4646744143147021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.06283629802055657, + "learning_rate": 1.8954290668431458e-05, + "loss": 0.0025, + "num_tokens": 21001265.0, + "reward": 1.15625, + "reward_std": 0.1293872892856598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15625, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 286.625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.46485888212506915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.06071718130260706, + "learning_rate": 1.895285672036896e-05, + "loss": 0.0024, + "num_tokens": 21009270.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 382.75, + "completions/mean_terminated_length": 382.75, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.4650433499354363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.04056950914673507, + "learning_rate": 1.8951421844133815e-05, + "loss": 0.0016, + "num_tokens": 21021420.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 391.875, + "completions/mean_terminated_length": 391.875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.46522781774580335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.06935037020593882, + "learning_rate": 1.8949986039874782e-05, + "loss": 0.0028, + "num_tokens": 21029115.0, + "reward": 1.9709820747375488, + "reward_std": 0.07511989027261734, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9709821343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.07511986792087555, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4654122855561704, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.1340021677315235, + "learning_rate": 1.8948549307740714e-05, + "loss": 0.0054, + "num_tokens": 21035690.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 586.25, + "completions/mean_terminated_length": 586.25, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.46559675336653755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.06794543284922838, + "learning_rate": 1.8947111647880567e-05, + "loss": 0.0027, + "num_tokens": 21051292.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.4657812211769046, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.04875321383588016, + "learning_rate": 1.894567306044338e-05, + "loss": 0.0019, + "num_tokens": 21060889.0, + "reward": 1.9249999523162842, + "reward_std": 0.2121320217847824, + "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 314.25, + "completions/mean_terminated_length": 314.25, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.4659656889872717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.04893139365594834, + "learning_rate": 1.89442335455783e-05, + "loss": 0.002, + "num_tokens": 21071419.0, + "reward": 1.0392156839370728, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.03921568766236305, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.4661501567976388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03456734656356275, + "learning_rate": 1.8942793103434566e-05, + "loss": 0.0014, + "num_tokens": 21076352.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.4663346246080059, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.25, + "kl": 0.1055802870541811, + "learning_rate": 1.894135173416151e-05, + "loss": 0.0042, + "num_tokens": 21080742.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 376.75, + "completions/mean_terminated_length": 376.75, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.46651909241837297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.0870319688692689, + "learning_rate": 1.8939909437908576e-05, + "loss": 0.0035, + "num_tokens": 21093580.0, + "reward": 1.1749999523162842, + "reward_std": 0.345377653837204, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17499999701976776, + "rewards/fixed_code_pass_all_test_reward/std": 0.3453776240348816, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 528.5, + "completions/mean_terminated_length": 528.5, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 0.4667035602287401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.02256252709776163, + "learning_rate": 1.8938466214825277e-05, + "loss": 0.0009, + "num_tokens": 21103144.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 364.875, + "completions/mean_terminated_length": 364.875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.4668880280391072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.04564733384177089, + "learning_rate": 1.8937022065061246e-05, + "loss": 0.0018, + "num_tokens": 21112031.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.46707249584947425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.06591849634423852, + "learning_rate": 1.8935576988766194e-05, + "loss": 0.0026, + "num_tokens": 21118021.0, + "reward": 1.388157844543457, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5131579041481018, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.4672569636598414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.024012718466110528, + "learning_rate": 1.8934130986089947e-05, + "loss": 0.001, + "num_tokens": 21122225.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 322.25, + "completions/mean_terminated_length": 322.25, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.46744143147020845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.05557045107707381, + "learning_rate": 1.893268405718241e-05, + "loss": 0.0022, + "num_tokens": 21130731.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 523.75, + "completions/mean_terminated_length": 523.75, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.4676258992805755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.046769768465310335, + "learning_rate": 1.8931236202193596e-05, + "loss": 0.0019, + "num_tokens": 21139505.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 408.875, + "completions/mean_terminated_length": 408.875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.46781036709094265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.671875, + "kl": 0.06395402282942086, + "learning_rate": 1.8929787421273606e-05, + "loss": 0.0026, + "num_tokens": 21152872.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.4679948349013097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.03023316792678088, + "learning_rate": 1.8928337714572638e-05, + "loss": 0.0012, + "num_tokens": 21164488.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 281.375, + "completions/mean_terminated_length": 281.375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.4681793027116768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.03262103453744203, + "learning_rate": 1.892688708224099e-05, + "loss": 0.0013, + "num_tokens": 21173523.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 293.875, + "completions/mean_terminated_length": 293.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4683637705220439, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0658258976181969, + "learning_rate": 1.8925435524429058e-05, + "loss": 0.0026, + "num_tokens": 21181130.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 215.5, + "completions/mean_terminated_length": 215.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.468548238332411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.03415358567144722, + "learning_rate": 1.892398304128732e-05, + "loss": 0.0014, + "num_tokens": 21186350.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 344.875, + "completions/mean_terminated_length": 344.875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.46873270614277807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671875, + "kl": 0.019386149593628943, + "learning_rate": 1.892252963296637e-05, + "loss": 0.0008, + "num_tokens": 21193189.0, + "reward": 1.9444444179534912, + "reward_std": 0.15713484585285187, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 294.25, + "completions/mean_terminated_length": 294.25, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.4689171739531452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26953125, + "kl": 0.08176001068204641, + "learning_rate": 1.892107529961688e-05, + "loss": 0.0033, + "num_tokens": 21201015.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 348.125, + "completions/mean_terminated_length": 348.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.46910164176351227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.020782180363312364, + "learning_rate": 1.8919620041389634e-05, + "loss": 0.0008, + "num_tokens": 21208952.0, + "reward": 1.9107142686843872, + "reward_std": 0.25253817439079285, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.46928610957387934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.053500829730182886, + "learning_rate": 1.8918163858435498e-05, + "loss": 0.0021, + "num_tokens": 21213610.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 173.625, + "completions/mean_terminated_length": 173.625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.46947057738424647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.06531019671820104, + "learning_rate": 1.8916706750905436e-05, + "loss": 0.0026, + "num_tokens": 21217943.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 303.5, + "completions/mean_terminated_length": 303.5, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.46965504519461354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.047468837117776275, + "learning_rate": 1.891524871895052e-05, + "loss": 0.0019, + "num_tokens": 21224875.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 281.375, + "completions/mean_terminated_length": 281.375, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4698395130049806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20703125, + "kl": 0.05453839199617505, + "learning_rate": 1.8913789762721898e-05, + "loss": 0.0022, + "num_tokens": 21233950.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 241.125, + "completions/mean_terminated_length": 241.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.47002398081534774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2451171875, + "kl": 0.08024826692417264, + "learning_rate": 1.8912329882370838e-05, + "loss": 0.0032, + "num_tokens": 21240567.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 548.75, + "completions/mean_terminated_length": 548.75, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "epoch": 0.4702084486257148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.03550759796053171, + "learning_rate": 1.891086907804868e-05, + "loss": 0.0014, + "num_tokens": 21254925.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 145.75, + "completions/mean_terminated_length": 145.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4703929164360819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2275390625, + "kl": 0.07591953198425472, + "learning_rate": 1.8909407349906876e-05, + "loss": 0.003, + "num_tokens": 21258955.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.470577384246449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.07420408120378852, + "learning_rate": 1.890794469809696e-05, + "loss": 0.003, + "num_tokens": 21269687.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 356.25, + "completions/mean_terminated_length": 356.25, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.4707618520568161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.04486829321831465, + "learning_rate": 1.8906481122770586e-05, + "loss": 0.0018, + "num_tokens": 21280233.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 357.125, + "completions/mean_terminated_length": 357.125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.47094631986718316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.08316498715430498, + "learning_rate": 1.8905016624079472e-05, + "loss": 0.0033, + "num_tokens": 21290058.0, + "reward": 1.85326087474823, + "reward_std": 0.1592201292514801, + "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, + "rewards/fixed_code_pass_all_test_reward/std": 0.15922017395496368, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 254.625, + "completions/mean_terminated_length": 254.625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.4711307876775503, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.04232099512591958, + "learning_rate": 1.8903551202175457e-05, + "loss": 0.0017, + "num_tokens": 21298975.0, + "reward": 1.9107143878936768, + "reward_std": 0.20112654566764832, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.2011265754699707, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.47131525548791736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.04296208010055125, + "learning_rate": 1.890208485721046e-05, + "loss": 0.0017, + "num_tokens": 21303685.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 286.625, + "completions/mean_terminated_length": 286.625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.47149972329828443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.048732723109424114, + "learning_rate": 1.890061758933651e-05, + "loss": 0.0019, + "num_tokens": 21310058.0, + "reward": 1.375, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 306.625, + "completions/mean_terminated_length": 306.625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.47168419110865156, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.609375, + "kl": 0.07097500725649297, + "learning_rate": 1.8899149398705714e-05, + "loss": 0.0028, + "num_tokens": 21319199.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 274.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.47186865891901864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.059380451566539705, + "learning_rate": 1.889768028547029e-05, + "loss": 0.0024, + "num_tokens": 21328142.0, + "reward": 1.6176470518112183, + "reward_std": 0.4111640155315399, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6176470518112183, + "rewards/fixed_code_pass_all_test_reward/std": 0.4111640453338623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 418.125, + "completions/mean_terminated_length": 418.125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.4720531267293857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.06535853166133165, + "learning_rate": 1.8896210249782546e-05, + "loss": 0.0026, + "num_tokens": 21337319.0, + "reward": 1.6875, + "reward_std": 0.22160130739212036, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.22160132229328156, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 258.5, + "completions/mean_terminated_length": 258.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.47223759453975284, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.051073907408863306, + "learning_rate": 1.8894739291794882e-05, + "loss": 0.002, + "num_tokens": 21343499.0, + "reward": 1.7336957454681396, + "reward_std": 0.3419041931629181, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7336956262588501, + "rewards/fixed_code_pass_all_test_reward/std": 0.3419041633605957, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 644.75, + "completions/mean_terminated_length": 644.75, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.4724220623501199, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.0492192980600521, + "learning_rate": 1.88932674116598e-05, + "loss": 0.002, + "num_tokens": 21357785.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 293.5, + "completions/mean_terminated_length": 293.5, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.472606530160487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.08145656669512391, + "learning_rate": 1.889179460952989e-05, + "loss": 0.0033, + "num_tokens": 21368445.0, + "reward": 1.7984694242477417, + "reward_std": 0.18526604771614075, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7984694242477417, + "rewards/fixed_code_pass_all_test_reward/std": 0.18526601791381836, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 382.875, + "completions/mean_terminated_length": 382.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4727909979708541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.03712722868658602, + "learning_rate": 1.8890320885557855e-05, + "loss": 0.0015, + "num_tokens": 21375828.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 305.75, + "completions/mean_terminated_length": 305.75, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.4729754657812212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.03660167055204511, + "learning_rate": 1.8888846239896465e-05, + "loss": 0.0015, + "num_tokens": 21385938.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.47315993359158826, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.04196989326737821, + "learning_rate": 1.8887370672698614e-05, + "loss": 0.0017, + "num_tokens": 21391536.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 164.625, + "completions/mean_terminated_length": 164.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.4733444014019554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.046252730302512646, + "learning_rate": 1.8885894184117267e-05, + "loss": 0.0019, + "num_tokens": 21395797.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 301.625, + "completions/mean_terminated_length": 301.625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.47352886921232246, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.05251197947654873, + "learning_rate": 1.8884416774305508e-05, + "loss": 0.0021, + "num_tokens": 21405546.0, + "reward": 1.8392857313156128, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8392857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 349.25, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.47371333702268953, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.07412926107645035, + "learning_rate": 1.8882938443416502e-05, + "loss": 0.003, + "num_tokens": 21415300.0, + "reward": 1.4456522464752197, + "reward_std": 0.3935409188270569, + "rewards/fixed_code_pass_all_test_reward/mean": 0.570652186870575, + "rewards/fixed_code_pass_all_test_reward/std": 0.23057831823825836, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 237.0, + "completions/mean_terminated_length": 237.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.47389780483305666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.06294345809146762, + "learning_rate": 1.8881459191603504e-05, + "loss": 0.0025, + "num_tokens": 21425628.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 360.625, + "completions/mean_terminated_length": 360.625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.47408227264342373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.06253949785605073, + "learning_rate": 1.8879979019019886e-05, + "loss": 0.0025, + "num_tokens": 21432945.0, + "reward": 1.60869562625885, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6086956262588501, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 384.625, + "completions/mean_terminated_length": 384.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.4742667404537908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.052997185848653316, + "learning_rate": 1.8878497925819094e-05, + "loss": 0.0021, + "num_tokens": 21438998.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.47445120826415793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.03089377167634666, + "learning_rate": 1.887701591215468e-05, + "loss": 0.0012, + "num_tokens": 21445117.0, + "reward": 1.183333396911621, + "reward_std": 0.329983115196228, + "rewards/fixed_code_pass_all_test_reward/mean": 0.18333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3299831748008728, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.474635676074525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11376953125, + "kl": 0.04278409038670361, + "learning_rate": 1.887553297818029e-05, + "loss": 0.0017, + "num_tokens": 21449140.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 402.875, + "completions/mean_terminated_length": 402.875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.4748201438848921, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.06219977838918567, + "learning_rate": 1.8874049124049662e-05, + "loss": 0.0025, + "num_tokens": 21457003.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 305.125, + "completions/mean_terminated_length": 305.125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4750046116952592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.060884799575433135, + "learning_rate": 1.8872564349916637e-05, + "loss": 0.0024, + "num_tokens": 21466860.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.4751890795056263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.04024537093937397, + "learning_rate": 1.887107865593514e-05, + "loss": 0.0016, + "num_tokens": 21474885.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 270.625, + "completions/mean_terminated_length": 270.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.47537354731599335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.07099897786974907, + "learning_rate": 1.8869592042259207e-05, + "loss": 0.0028, + "num_tokens": 21484426.0, + "reward": 1.2884615659713745, + "reward_std": 0.4404523968696594, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2884615361690521, + "rewards/fixed_code_pass_all_test_reward/std": 0.44045236706733704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 469.25, + "completions/mean_terminated_length": 469.25, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.4755580151263604, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.0633567743934691, + "learning_rate": 1.886810450904295e-05, + "loss": 0.0025, + "num_tokens": 21497244.0, + "reward": 1.7457627058029175, + "reward_std": 0.7054944038391113, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8707627058029175, + "rewards/fixed_code_pass_all_test_reward/std": 0.35204118490219116, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 298.125, + "completions/mean_terminated_length": 298.125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.47574248293672755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.06306981621310115, + "learning_rate": 1.8866616056440597e-05, + "loss": 0.0025, + "num_tokens": 21508773.0, + "reward": 1.2556817531585693, + "reward_std": 0.16287250816822052, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2556818425655365, + "rewards/fixed_code_pass_all_test_reward/std": 0.16287250816822052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 386.75, + "completions/mean_terminated_length": 386.75, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.4759269507470946, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.07181157660670578, + "learning_rate": 1.886512668460645e-05, + "loss": 0.0029, + "num_tokens": 21520267.0, + "reward": 0.887499988079071, + "reward_std": 0.36030739545822144, + "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, + "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 349.875, + "completions/mean_terminated_length": 349.875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.4761114185574617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.053099709097296, + "learning_rate": 1.8863636393694926e-05, + "loss": 0.0021, + "num_tokens": 21530506.0, + "reward": 1.0714285373687744, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.4762958863678288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.05512688192538917, + "learning_rate": 1.8862145183860522e-05, + "loss": 0.0022, + "num_tokens": 21538647.0, + "reward": 1.928125023841858, + "reward_std": 0.20329320430755615, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9281250238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.20329320430755615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 308.125, + "completions/mean_terminated_length": 308.125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.4764803541781959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.032642005826346576, + "learning_rate": 1.886065305525784e-05, + "loss": 0.0013, + "num_tokens": 21545168.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 393.625, + "completions/mean_terminated_length": 393.625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.47666482198856297, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.04569119121879339, + "learning_rate": 1.8859160008041573e-05, + "loss": 0.0018, + "num_tokens": 21555053.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 330.5, + "completions/mean_terminated_length": 330.5, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.4768492897989301, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.023185471072793007, + "learning_rate": 1.8857666042366512e-05, + "loss": 0.0009, + "num_tokens": 21561681.0, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.47703375760929717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.025671910145319998, + "learning_rate": 1.8856171158387543e-05, + "loss": 0.001, + "num_tokens": 21565977.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 321.0, + "completions/mean_terminated_length": 321.0, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.47721822541966424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.265625, + "kl": 0.07056557713076472, + "learning_rate": 1.885467535625964e-05, + "loss": 0.0028, + "num_tokens": 21574953.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 209.0, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.47740269323003137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.06571342574898154, + "learning_rate": 1.885317863613788e-05, + "loss": 0.0026, + "num_tokens": 21583449.0, + "reward": 1.875, + "reward_std": 0.1157275140285492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1157275140285492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 385.75, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.47758716104039844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.033105006674304605, + "learning_rate": 1.885168099817743e-05, + "loss": 0.0013, + "num_tokens": 21591383.0, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 363.625, + "completions/mean_terminated_length": 363.625, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.4777716288507655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.029328880831599236, + "learning_rate": 1.8850182442533568e-05, + "loss": 0.0012, + "num_tokens": 21600156.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 790.75, + "completions/mean_terminated_length": 790.75, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "epoch": 0.47795609666113265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.52734375, + "kl": 0.013846810557879508, + "learning_rate": 1.8848682969361637e-05, + "loss": 0.0006, + "num_tokens": 21617090.0, + "reward": 1.2083333730697632, + "reward_std": 1.006920576095581, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5833333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.49601587653160095, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 380.625, + "completions/mean_terminated_length": 380.625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.4781405644714997, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.03622040431946516, + "learning_rate": 1.8847182578817104e-05, + "loss": 0.0014, + "num_tokens": 21625631.0, + "reward": 1.71875, + "reward_std": 0.38816189765930176, + "rewards/fixed_code_pass_all_test_reward/mean": 0.71875, + "rewards/fixed_code_pass_all_test_reward/std": 0.38816189765930176, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 182.375, + "completions/mean_terminated_length": 182.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.4783250322818668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.875, + "kl": 0.11300290073268116, + "learning_rate": 1.884568127105552e-05, + "loss": 0.0045, + "num_tokens": 21632650.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 237.875, + "completions/mean_terminated_length": 237.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.4785095000922339, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.04447584296576679, + "learning_rate": 1.884417904623252e-05, + "loss": 0.0018, + "num_tokens": 21638633.0, + "reward": 1.0612244606018066, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.06122449040412903, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 231.875, + "completions/mean_terminated_length": 231.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.478693967902601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.0870233066380024, + "learning_rate": 1.8842675904503855e-05, + "loss": 0.0035, + "num_tokens": 21644816.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.47887843571296806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.08226994518190622, + "learning_rate": 1.8841171846025353e-05, + "loss": 0.0033, + "num_tokens": 21649315.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 167.0, + "completions/mean_terminated_length": 167.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.4790629035233352, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.068166937911883, + "learning_rate": 1.883966687095295e-05, + "loss": 0.0027, + "num_tokens": 21656883.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 392.625, + "completions/mean_terminated_length": 392.625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.47924737133370227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.038940017111599445, + "learning_rate": 1.8838160979442675e-05, + "loss": 0.0016, + "num_tokens": 21665072.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 247.75, + "completions/mean_terminated_length": 247.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.47943183914406934, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.0648960149846971, + "learning_rate": 1.883665417165064e-05, + "loss": 0.0026, + "num_tokens": 21672630.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 299.0, + "completions/mean_terminated_length": 299.0, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.47961630695443647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.036986818769946694, + "learning_rate": 1.883514644773307e-05, + "loss": 0.0015, + "num_tokens": 21683542.0, + "reward": 1.4036458730697632, + "reward_std": 0.0989387258887291, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4036458134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.09893874824047089, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 266.125, + "completions/mean_terminated_length": 266.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.47980077476480354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.048862273804843426, + "learning_rate": 1.8833637807846266e-05, + "loss": 0.002, + "num_tokens": 21688903.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.4799852425751706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.029547258280217648, + "learning_rate": 1.883212825214664e-05, + "loss": 0.0012, + "num_tokens": 21693887.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 201.0, + "completions/mean_terminated_length": 201.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.48016971038553774, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.08336235024034977, + "learning_rate": 1.8830617780790694e-05, + "loss": 0.0033, + "num_tokens": 21702599.0, + "reward": 1.7827380895614624, + "reward_std": 0.17995494604110718, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7827380895614624, + "rewards/fixed_code_pass_all_test_reward/std": 0.17995496094226837, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 154.125, + "completions/mean_terminated_length": 154.125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.4803541781959048, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.051262628054246306, + "learning_rate": 1.8829106393935016e-05, + "loss": 0.0021, + "num_tokens": 21706760.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 195.0, + "completions/mean_terminated_length": 195.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.4805386460062719, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.03332006628625095, + "learning_rate": 1.8827594091736307e-05, + "loss": 0.0013, + "num_tokens": 21711160.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 262.625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.480723113816639, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.06753552565351129, + "learning_rate": 1.8826080874351343e-05, + "loss": 0.0027, + "num_tokens": 21717845.0, + "reward": 1.1612902879714966, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16129031777381897, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.4809075816270061, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.031605039490386844, + "learning_rate": 1.8824566741937012e-05, + "loss": 0.0013, + "num_tokens": 21727823.0, + "reward": 1.5, + "reward_std": 0.14304131269454956, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.1430412381887436, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 127.625, + "completions/mean_terminated_length": 127.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.48109204943737316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.021539915003813803, + "learning_rate": 1.8823051694650282e-05, + "loss": 0.0009, + "num_tokens": 21731668.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 500.0, + "completions/mean_terminated_length": 278.8571472167969, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.4812765172477403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.06381104308820795, + "learning_rate": 1.8821535732648227e-05, + "loss": 0.0026, + "num_tokens": 21739964.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.48146098505810736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.06739085214212537, + "learning_rate": 1.8820018856088013e-05, + "loss": 0.0027, + "num_tokens": 21747042.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 250.625, + "completions/mean_terminated_length": 250.625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.48164545286847443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.06153931561857462, + "learning_rate": 1.88185010651269e-05, + "loss": 0.0025, + "num_tokens": 21757191.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 424.625, + "completions/mean_terminated_length": 424.625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.48182992067884156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.07224446663167328, + "learning_rate": 1.8816982359922236e-05, + "loss": 0.0029, + "num_tokens": 21767052.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 322.625, + "completions/mean_terminated_length": 322.625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.48201438848920863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.07759394031018019, + "learning_rate": 1.881546274063148e-05, + "loss": 0.0031, + "num_tokens": 21778697.0, + "reward": 1.2217742204666138, + "reward_std": 0.19541148841381073, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2217741757631302, + "rewards/fixed_code_pass_all_test_reward/std": 0.1954115331172943, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 263.0, + "completions/mean_terminated_length": 263.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.4821988562995757, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.0468183069024235, + "learning_rate": 1.881394220741217e-05, + "loss": 0.0019, + "num_tokens": 21788897.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.48238332410994283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.05207995884120464, + "learning_rate": 1.8812420760421942e-05, + "loss": 0.0021, + "num_tokens": 21793448.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 257.25, + "completions/mean_terminated_length": 257.25, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.4825677919203099, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0718965525738895, + "learning_rate": 1.8810898399818532e-05, + "loss": 0.0029, + "num_tokens": 21803458.0, + "reward": 1.5, + "reward_std": 0.4750940203666687, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.4750939607620239, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 645.75, + "completions/mean_terminated_length": 645.75, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.482752259730677, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.059276743326336145, + "learning_rate": 1.8809375125759776e-05, + "loss": 0.0024, + "num_tokens": 21819176.0, + "reward": 1.2041666507720947, + "reward_std": 0.3470682203769684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20416668057441711, + "rewards/fixed_code_pass_all_test_reward/std": 0.34706825017929077, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 175.875, + "completions/mean_terminated_length": 175.875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.4829367275410441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.038110974011942744, + "learning_rate": 1.8807850938403587e-05, + "loss": 0.0015, + "num_tokens": 21823423.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 281.0, + "completions/mean_terminated_length": 281.0, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.4831211953514112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.05661628535017371, + "learning_rate": 1.8806325837907985e-05, + "loss": 0.0023, + "num_tokens": 21829695.0, + "reward": 1.433823585510254, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43382352590560913, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 532.0, + "completions/mean_terminated_length": 315.4285888671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.48330566316177825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.05484827782493085, + "learning_rate": 1.8804799824431083e-05, + "loss": 0.0022, + "num_tokens": 21841871.0, + "reward": 1.4427082538604736, + "reward_std": 0.7130228281021118, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5677083134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.47031813859939575, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 442.375, + "completions/mean_terminated_length": 442.375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.4834901309721454, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.045658388757146895, + "learning_rate": 1.880327289813109e-05, + "loss": 0.0018, + "num_tokens": 21851698.0, + "reward": 1.6666667461395264, + "reward_std": 0.17817415297031403, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 285.125, + "completions/mean_terminated_length": 285.125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.48367459878251245, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0672224098816514, + "learning_rate": 1.8801745059166305e-05, + "loss": 0.0027, + "num_tokens": 21860387.0, + "reward": 1.1029411554336548, + "reward_std": 0.5567187070846558, + "rewards/fixed_code_pass_all_test_reward/mean": 0.22794118523597717, + "rewards/fixed_code_pass_all_test_reward/std": 0.34613537788391113, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.4838590665928795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.05607604747638106, + "learning_rate": 1.8800216307695123e-05, + "loss": 0.0022, + "num_tokens": 21870177.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 298.0, + "completions/mean_terminated_length": 298.0, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.48404353440324666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.06874278502073139, + "learning_rate": 1.8798686643876037e-05, + "loss": 0.0027, + "num_tokens": 21880145.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 130.375, + "completions/mean_terminated_length": 130.375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.48422800221361373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.050453036557883024, + "learning_rate": 1.8797156067867637e-05, + "loss": 0.002, + "num_tokens": 21883852.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 307.25, + "completions/mean_terminated_length": 307.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4844124700239808, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.08801545714959502, + "learning_rate": 1.8795624579828593e-05, + "loss": 0.0035, + "num_tokens": 21892582.0, + "reward": 1.6351351737976074, + "reward_std": 0.5035613775253296, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6351351737976074, + "rewards/fixed_code_pass_all_test_reward/std": 0.5035613775253296, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 319.625, + "completions/mean_terminated_length": 319.625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.48459693783434793, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.048155746422708035, + "learning_rate": 1.8794092179917687e-05, + "loss": 0.0019, + "num_tokens": 21899595.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 190.875, + "completions/mean_terminated_length": 190.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.484781405644715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.06102281576022506, + "learning_rate": 1.879255886829378e-05, + "loss": 0.0024, + "num_tokens": 21903954.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 237.25, + "completions/mean_terminated_length": 237.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.4849658734550821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.10516053065657616, + "learning_rate": 1.879102464511585e-05, + "loss": 0.0042, + "num_tokens": 21912524.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 505.5, + "completions/mean_terminated_length": 505.5, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.4851503412654492, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.05615597194992006, + "learning_rate": 1.878948951054294e-05, + "loss": 0.0022, + "num_tokens": 21921848.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 169.25, + "completions/mean_terminated_length": 169.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.4853348090758163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.357421875, + "kl": 0.06627262197434902, + "learning_rate": 1.878795346473421e-05, + "loss": 0.0027, + "num_tokens": 21925962.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.48551927688618335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.06143044214695692, + "learning_rate": 1.8786416507848904e-05, + "loss": 0.0025, + "num_tokens": 21934082.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 188.625, + "completions/mean_terminated_length": 188.625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.4857037446965505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.06379149632994086, + "learning_rate": 1.8784878640046366e-05, + "loss": 0.0026, + "num_tokens": 21938359.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.48588821250691755, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.71875, + "kl": 0.11808006651699543, + "learning_rate": 1.8783339861486033e-05, + "loss": 0.0047, + "num_tokens": 21946057.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.4860726803172846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06879752920940518, + "learning_rate": 1.8781800172327434e-05, + "loss": 0.0028, + "num_tokens": 21952371.0, + "reward": 1.83695650100708, + "reward_std": 0.2250213772058487, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8369565010070801, + "rewards/fixed_code_pass_all_test_reward/std": 0.2250213921070099, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 331.375, + "completions/mean_terminated_length": 331.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.48625714812765175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.06551646068692207, + "learning_rate": 1.8780259572730192e-05, + "loss": 0.0026, + "num_tokens": 21959270.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 230.875, + "completions/mean_terminated_length": 230.875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4864416159380188, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21484375, + "kl": 0.09365212917327881, + "learning_rate": 1.8778718062854025e-05, + "loss": 0.0037, + "num_tokens": 21967357.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 240.25, + "completions/mean_terminated_length": 240.25, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4866260837483859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.166015625, + "kl": 0.07621316704899073, + "learning_rate": 1.877717564285875e-05, + "loss": 0.003, + "num_tokens": 21973271.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 246.125, + "completions/mean_terminated_length": 246.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.486810551558753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.04965034918859601, + "learning_rate": 1.8775632312904272e-05, + "loss": 0.002, + "num_tokens": 21982552.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.4869950193691201, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.421875, + "kl": 0.07729699392803013, + "learning_rate": 1.87740880731506e-05, + "loss": 0.0031, + "num_tokens": 21987213.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 355.25, + "completions/mean_terminated_length": 355.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.48717948717948717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.06013509491458535, + "learning_rate": 1.877254292375782e-05, + "loss": 0.0024, + "num_tokens": 21999343.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.4873639549898543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.027457835152745247, + "learning_rate": 1.877099686488613e-05, + "loss": 0.0011, + "num_tokens": 22005673.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 310.375, + "completions/mean_terminated_length": 310.375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.48754842280022137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.08892476977780461, + "learning_rate": 1.8769449896695815e-05, + "loss": 0.0036, + "num_tokens": 22014396.0, + "reward": 1.875, + "reward_std": 0.14608041942119598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.1460804045200348, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 137.375, + "completions/mean_terminated_length": 137.375, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.48773289061058844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.04619123344309628, + "learning_rate": 1.8767902019347248e-05, + "loss": 0.0018, + "num_tokens": 22018423.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 398.875, + "completions/mean_terminated_length": 398.875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.4879173584209555, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.08320645708590746, + "learning_rate": 1.876635323300091e-05, + "loss": 0.0033, + "num_tokens": 22028726.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 419.0, + "completions/mean_terminated_length": 419.0, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.48810182623132264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.0428480498958379, + "learning_rate": 1.8764803537817368e-05, + "loss": 0.0017, + "num_tokens": 22040734.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.4882862940416897, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.07826476637274027, + "learning_rate": 1.876325293395728e-05, + "loss": 0.0031, + "num_tokens": 22047567.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 113.5, + "completions/mean_terminated_length": 113.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.4884707618520568, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.04317105608060956, + "learning_rate": 1.8761701421581403e-05, + "loss": 0.0017, + "num_tokens": 22051403.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.4886552296624239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.05084071250166744, + "learning_rate": 1.876014900085059e-05, + "loss": 0.002, + "num_tokens": 22061123.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 389.125, + "completions/mean_terminated_length": 389.125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.488839697472791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.05006074230186641, + "learning_rate": 1.8758595671925785e-05, + "loss": 0.002, + "num_tokens": 22069356.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 298.25, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.48902416528315806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.04425408411771059, + "learning_rate": 1.875704143496803e-05, + "loss": 0.0018, + "num_tokens": 22075998.0, + "reward": 1.082446813583374, + "reward_std": 0.08274652063846588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.08244681358337402, + "rewards/fixed_code_pass_all_test_reward/std": 0.08274653553962708, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 409.875, + "completions/mean_terminated_length": 409.875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.4892086330935252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.07869835942983627, + "learning_rate": 1.8755486290138446e-05, + "loss": 0.0031, + "num_tokens": 22083733.0, + "reward": 1.64673912525177, + "reward_std": 0.12821611762046814, + "rewards/fixed_code_pass_all_test_reward/mean": 0.64673912525177, + "rewards/fixed_code_pass_all_test_reward/std": 0.12821611762046814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 320.375, + "completions/mean_terminated_length": 320.375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.48939310090389226, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0628776231314987, + "learning_rate": 1.8753930237598273e-05, + "loss": 0.0025, + "num_tokens": 22092944.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 613.25, + "completions/mean_terminated_length": 613.25, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "epoch": 0.48957756871425934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.703125, + "kl": 0.02920834848191589, + "learning_rate": 1.8752373277508827e-05, + "loss": 0.0012, + "num_tokens": 22113202.0, + "reward": 1.6572580337524414, + "reward_std": 0.4796862006187439, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7822580337524414, + "rewards/fixed_code_pass_all_test_reward/std": 0.4090364873409271, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 266.5, + "completions/mean_terminated_length": 266.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.48976203652462647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.08292871667072177, + "learning_rate": 1.8750815410031527e-05, + "loss": 0.0033, + "num_tokens": 22123230.0, + "reward": 1.7805233001708984, + "reward_std": 0.41159871220588684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7805232405662537, + "rewards/fixed_code_pass_all_test_reward/std": 0.41159871220588684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 199.625, + "completions/mean_terminated_length": 199.625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.48994650433499354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.07881693355739117, + "learning_rate": 1.8749256635327877e-05, + "loss": 0.0032, + "num_tokens": 22128475.0, + "reward": 1.329545497894287, + "reward_std": 0.13690368831157684, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3295454680919647, + "rewards/fixed_code_pass_all_test_reward/std": 0.13690368831157684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 251.75, + "completions/mean_terminated_length": 251.75, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4901309721453606, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03369140625, + "kl": 0.019764596479944885, + "learning_rate": 1.8747696953559483e-05, + "loss": 0.0008, + "num_tokens": 22134569.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 396.0, + "completions/mean_terminated_length": 396.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.49031543995572774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.053213195176795125, + "learning_rate": 1.874613636488804e-05, + "loss": 0.0021, + "num_tokens": 22142153.0, + "reward": 1.6634615659713745, + "reward_std": 0.4740743041038513, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7884615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.40023237466812134, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 391.0, + "completions/mean_terminated_length": 391.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.4904999077660948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.058236134704202414, + "learning_rate": 1.8744574869475345e-05, + "loss": 0.0023, + "num_tokens": 22151105.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 280.875, + "completions/mean_terminated_length": 280.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.4906843755764619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.08595698280259967, + "learning_rate": 1.8743012467483277e-05, + "loss": 0.0034, + "num_tokens": 22160064.0, + "reward": 1.44921875, + "reward_std": 0.2608886957168579, + "rewards/fixed_code_pass_all_test_reward/mean": 0.44921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2608887255191803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.490868843386829, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.05344819067977369, + "learning_rate": 1.8741449159073818e-05, + "loss": 0.0021, + "num_tokens": 22166683.0, + "reward": 1.767241358757019, + "reward_std": 0.38519835472106934, + "rewards/fixed_code_pass_all_test_reward/mean": 0.767241358757019, + "rewards/fixed_code_pass_all_test_reward/std": 0.38519835472106934, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 404.5, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.4910533111971961, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.05596211808733642, + "learning_rate": 1.8739884944409044e-05, + "loss": 0.0022, + "num_tokens": 22174471.0, + "reward": 1.5, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.49123777900756316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.04104274197015911, + "learning_rate": 1.873831982365112e-05, + "loss": 0.0016, + "num_tokens": 22182777.0, + "reward": 1.8333333730697632, + "reward_std": 0.17817412316799164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 446.375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.4914222468179303, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04835917765740305, + "learning_rate": 1.8736753796962307e-05, + "loss": 0.0019, + "num_tokens": 22195556.0, + "reward": 1.7872023582458496, + "reward_std": 0.004209025297313929, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7872023582458496, + "rewards/fixed_code_pass_all_test_reward/std": 0.004208974074572325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 247.75, + "completions/mean_terminated_length": 247.75, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.49160671462829736, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.06592613132670522, + "learning_rate": 1.8735186864504958e-05, + "loss": 0.0026, + "num_tokens": 22204170.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.49179118243866443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.08524602698162198, + "learning_rate": 1.873361902644153e-05, + "loss": 0.0034, + "num_tokens": 22211321.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 418.75, + "completions/mean_terminated_length": 418.75, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.49197565024903156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.030290422961115837, + "learning_rate": 1.8732050282934557e-05, + "loss": 0.0012, + "num_tokens": 22219983.0, + "reward": 1.024999976158142, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.02500000037252903, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 425.875, + "completions/mean_terminated_length": 425.875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.49216011805939863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.04072719987016171, + "learning_rate": 1.873048063414668e-05, + "loss": 0.0016, + "num_tokens": 22231966.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4923445858697657, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.0708764863666147, + "learning_rate": 1.872891008024063e-05, + "loss": 0.0028, + "num_tokens": 22239565.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 556.125, + "completions/mean_terminated_length": 556.125, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.49252905368013283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6171875, + "kl": 0.06992996204644442, + "learning_rate": 1.8727338621379233e-05, + "loss": 0.0028, + "num_tokens": 22252582.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 268.375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4927135214904999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.05021062935702503, + "learning_rate": 1.8725766257725403e-05, + "loss": 0.002, + "num_tokens": 22258825.0, + "reward": 1.9663461446762085, + "reward_std": 0.0951874852180481, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9663461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.09518745541572571, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 370.375, + "completions/mean_terminated_length": 370.375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.492897989300867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.03130536514800042, + "learning_rate": 1.8724192989442155e-05, + "loss": 0.0013, + "num_tokens": 22265516.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.4930824571112341, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.06410781713202596, + "learning_rate": 1.8722618816692598e-05, + "loss": 0.0026, + "num_tokens": 22269854.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 584.125, + "completions/mean_terminated_length": 584.125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "epoch": 0.4932669249216012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.04553838772699237, + "learning_rate": 1.8721043739639927e-05, + "loss": 0.0018, + "num_tokens": 22285527.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 351.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.49345139273196825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.023106445791199803, + "learning_rate": 1.8719467758447435e-05, + "loss": 0.0009, + "num_tokens": 22292680.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4936358605423354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.0685864114202559, + "learning_rate": 1.8717890873278512e-05, + "loss": 0.0027, + "num_tokens": 22302086.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 474.875, + "completions/mean_terminated_length": 474.875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.49382032835270245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3125, + "kl": 0.047360674012452364, + "learning_rate": 1.871631308429664e-05, + "loss": 0.0019, + "num_tokens": 22308861.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 274.375, + "completions/mean_terminated_length": 274.375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.4940047961630695, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.08060811553150415, + "learning_rate": 1.871473439166539e-05, + "loss": 0.0032, + "num_tokens": 22322808.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 271.25, + "completions/mean_terminated_length": 271.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.49418926397343665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.056226518703624606, + "learning_rate": 1.871315479554843e-05, + "loss": 0.0022, + "num_tokens": 22332698.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 346.125, + "completions/mean_terminated_length": 346.125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.4943737317838037, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.02856296324171126, + "learning_rate": 1.871157429610953e-05, + "loss": 0.0011, + "num_tokens": 22340163.0, + "reward": 1.7232143878936768, + "reward_std": 0.19631260633468628, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7232142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.19631259143352509, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 370.375, + "completions/mean_terminated_length": 370.375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.4945581995941708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.058237980119884014, + "learning_rate": 1.8709992893512535e-05, + "loss": 0.0023, + "num_tokens": 22347654.0, + "reward": 1.8571429252624512, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.49474266740453793, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.07689151540398598, + "learning_rate": 1.87084105879214e-05, + "loss": 0.0031, + "num_tokens": 22356111.0, + "reward": 1.2337963581085205, + "reward_std": 0.12557433545589447, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23379631340503693, + "rewards/fixed_code_pass_all_test_reward/std": 0.12557432055473328, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 169.875, + "completions/mean_terminated_length": 169.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.494927135214905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.06833068933337927, + "learning_rate": 1.8706827379500172e-05, + "loss": 0.0027, + "num_tokens": 22360270.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.4951116030252721, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.052572570042684674, + "learning_rate": 1.8705243268412977e-05, + "loss": 0.0021, + "num_tokens": 22366507.0, + "reward": 1.8068182468414307, + "reward_std": 0.1347913146018982, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8068181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.1347913295030594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 224.875, + "completions/mean_terminated_length": 224.875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.4952960708356392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.037722908426076174, + "learning_rate": 1.8703658254824052e-05, + "loss": 0.0015, + "num_tokens": 22371562.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 360.375, + "completions/mean_terminated_length": 360.375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.4954805386460063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.051082147285342216, + "learning_rate": 1.8702072338897725e-05, + "loss": 0.002, + "num_tokens": 22379389.0, + "reward": 1.0078125, + "reward_std": 0.022097086533904076, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0078125, + "rewards/fixed_code_pass_all_test_reward/std": 0.022097086533904076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 313.625, + "completions/mean_terminated_length": 313.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.49566500645637335, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.037178396014496684, + "learning_rate": 1.87004855207984e-05, + "loss": 0.0015, + "num_tokens": 22389458.0, + "reward": 1.6720588207244873, + "reward_std": 0.09566734731197357, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6720588207244873, + "rewards/fixed_code_pass_all_test_reward/std": 0.09566739946603775, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 325.125, + "completions/mean_terminated_length": 325.125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.4958494742667405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.029861239483579993, + "learning_rate": 1.86988978006906e-05, + "loss": 0.0012, + "num_tokens": 22396219.0, + "reward": 1.9500000476837158, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 184.5, + "completions/mean_terminated_length": 184.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.49603394207710755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.031399325118400156, + "learning_rate": 1.8697309178738923e-05, + "loss": 0.0013, + "num_tokens": 22400727.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 371.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.4962184098874746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.053763410774990916, + "learning_rate": 1.8695719655108068e-05, + "loss": 0.0022, + "num_tokens": 22408335.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 377.875, + "completions/mean_terminated_length": 377.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.49640287769784175, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.05716722644865513, + "learning_rate": 1.869412922996283e-05, + "loss": 0.0023, + "num_tokens": 22419886.0, + "reward": 1.3776042461395264, + "reward_std": 0.1653360277414322, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3776041865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.1653360277414322, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 449.75, + "completions/mean_terminated_length": 449.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.4965873455082088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.04980372806312516, + "learning_rate": 1.8692537903468085e-05, + "loss": 0.002, + "num_tokens": 22433196.0, + "reward": 1.921875, + "reward_std": 0.0646936446428299, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.06469365209341049, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 178.625, + "completions/mean_terminated_length": 178.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.4967718133185759, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9140625, + "kl": 0.05375754344277084, + "learning_rate": 1.869094567578882e-05, + "loss": 0.0022, + "num_tokens": 22437601.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 457.0, + "completions/mean_terminated_length": 457.0, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.496956281128943, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.06520343385636806, + "learning_rate": 1.86893525470901e-05, + "loss": 0.0026, + "num_tokens": 22451169.0, + "reward": 1.5654761791229248, + "reward_std": 0.15573996305465698, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5654761791229248, + "rewards/fixed_code_pass_all_test_reward/std": 0.15573999285697937, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 355.5, + "completions/mean_terminated_length": 355.5, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.4971407489393101, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.03270853543654084, + "learning_rate": 1.86877585175371e-05, + "loss": 0.0013, + "num_tokens": 22461629.0, + "reward": 1.852150559425354, + "reward_std": 0.3175621032714844, + "rewards/fixed_code_pass_all_test_reward/mean": 0.852150559425354, + "rewards/fixed_code_pass_all_test_reward/std": 0.317562073469162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 368.875, + "completions/mean_terminated_length": 368.875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.49732521674967717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.03567098663188517, + "learning_rate": 1.8686163587295064e-05, + "loss": 0.0014, + "num_tokens": 22468332.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.4975096845600443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.03171967202797532, + "learning_rate": 1.8684567756529354e-05, + "loss": 0.0013, + "num_tokens": 22473119.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 266.5, + "completions/mean_terminated_length": 266.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.49769415237041137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.07879601046442986, + "learning_rate": 1.8682971025405413e-05, + "loss": 0.0032, + "num_tokens": 22479475.0, + "reward": 1.53125, + "reward_std": 0.4078085720539093, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3735266327857971, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 443.0, + "completions/mean_terminated_length": 443.0, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.49787862018077844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.025188424857333302, + "learning_rate": 1.868137339408878e-05, + "loss": 0.001, + "num_tokens": 22488027.0, + "reward": 1.2000000476837158, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 282.5, + "completions/mean_terminated_length": 282.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.49806308799114557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.609375, + "kl": 0.03303690068423748, + "learning_rate": 1.8679774862745082e-05, + "loss": 0.0013, + "num_tokens": 22493967.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 172.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.49824755580151264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.03930973447859287, + "learning_rate": 1.8678175431540053e-05, + "loss": 0.0016, + "num_tokens": 22498126.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 249.5, + "completions/mean_terminated_length": 249.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4984320236118797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2333984375, + "kl": 0.0703016750048846, + "learning_rate": 1.86765751006395e-05, + "loss": 0.0028, + "num_tokens": 22507498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 220.5, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.49861649142224684, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.06452929903753102, + "learning_rate": 1.8674973870209344e-05, + "loss": 0.0026, + "num_tokens": 22512158.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 291.5, + "completions/mean_terminated_length": 291.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.4988009592326139, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.017709423671476543, + "learning_rate": 1.8673371740415586e-05, + "loss": 0.0007, + "num_tokens": 22518378.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.498985427042981, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.07703635562211275, + "learning_rate": 1.8671768711424326e-05, + "loss": 0.0031, + "num_tokens": 22526774.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 312.625, + "completions/mean_terminated_length": 312.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.4991698948533481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.04877238115295768, + "learning_rate": 1.8670164783401753e-05, + "loss": 0.002, + "num_tokens": 22535587.0, + "reward": 1.9425675868988037, + "reward_std": 0.16244345903396606, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9425675868988037, + "rewards/fixed_code_pass_all_test_reward/std": 0.16244345903396606, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 359.0, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.4993543626637152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671875, + "kl": 0.030382052063941956, + "learning_rate": 1.8668559956514155e-05, + "loss": 0.0012, + "num_tokens": 22542307.0, + "reward": 1.921875, + "reward_std": 0.22097086906433105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.921875, + "rewards/fixed_code_pass_all_test_reward/std": 0.22097088396549225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 288.0, + "completions/mean_terminated_length": 288.0, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.49953883047408226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.06432184716686606, + "learning_rate": 1.8666954230927904e-05, + "loss": 0.0026, + "num_tokens": 22548715.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 331.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.4997232982844494, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.051102206809446216, + "learning_rate": 1.8665347606809475e-05, + "loss": 0.002, + "num_tokens": 22561720.0, + "reward": 1.454741358757019, + "reward_std": 0.7262423634529114, + "rewards/fixed_code_pass_all_test_reward/mean": 0.579741358757019, + "rewards/fixed_code_pass_all_test_reward/std": 0.4866082966327667, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 247.25, + "completions/mean_terminated_length": 247.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.49990776609481646, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.03672591829672456, + "learning_rate": 1.866374008432543e-05, + "loss": 0.0015, + "num_tokens": 22569538.0, + "reward": 1.968181848526001, + "reward_std": 0.08999539911746979, + "rewards/fixed_code_pass_all_test_reward/mean": 0.968181848526001, + "rewards/fixed_code_pass_all_test_reward/std": 0.08999541401863098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 295.25, + "completions/mean_terminated_length": 295.25, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5000922339051835, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.6875, + "kl": 0.2124472400173545, + "learning_rate": 1.8662131663642437e-05, + "loss": 0.0085, + "num_tokens": 22576156.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.5002767017155506, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.0659384629689157, + "learning_rate": 1.866052234492723e-05, + "loss": 0.0026, + "num_tokens": 22585643.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 224.75, + "completions/mean_terminated_length": 224.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.5004611695259177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.028452370315790176, + "learning_rate": 1.865891212834666e-05, + "loss": 0.0011, + "num_tokens": 22590521.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 428.0, + "completions/mean_terminated_length": 428.0, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.5006456373362849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.028288972564041615, + "learning_rate": 1.8657301014067663e-05, + "loss": 0.0011, + "num_tokens": 22602601.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5008301051466519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.057589816860854626, + "learning_rate": 1.865568900225727e-05, + "loss": 0.0023, + "num_tokens": 22610920.0, + "reward": 1.7857143878936768, + "reward_std": 0.3926767408847809, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.23389144241809845, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 115.125, + "completions/mean_terminated_length": 115.125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.501014572957019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.421875, + "kl": 0.06050415366189554, + "learning_rate": 1.86540760930826e-05, + "loss": 0.0024, + "num_tokens": 22614505.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 377.875, + "completions/mean_terminated_length": 377.875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5011990407673861, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.023827393422834575, + "learning_rate": 1.865246228671088e-05, + "loss": 0.001, + "num_tokens": 22621856.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 603.125, + "completions/mean_terminated_length": 603.125, + "completions/min_length": 545.0, + "completions/min_terminated_length": 545.0, + "epoch": 0.5013835085777532, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.02836527058389038, + "learning_rate": 1.86508475833094e-05, + "loss": 0.0011, + "num_tokens": 22639273.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 158.0, + "completions/mean_terminated_length": 158.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.5015679763881202, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.9375, + "kl": 0.060538227669894695, + "learning_rate": 1.864923198304558e-05, + "loss": 0.0024, + "num_tokens": 22643601.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 371.5, + "completions/mean_terminated_length": 371.5, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5017524441984874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.0789107196033001, + "learning_rate": 1.8647615486086898e-05, + "loss": 0.0032, + "num_tokens": 22651477.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.5019369120088545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.053563714027404785, + "learning_rate": 1.8645998092600957e-05, + "loss": 0.0021, + "num_tokens": 22657691.0, + "reward": 1.5625, + "reward_std": 0.04115033894777298, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.04115033894777298, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 249.875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5021213798192216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.04365982650779188, + "learning_rate": 1.8644379802755428e-05, + "loss": 0.0017, + "num_tokens": 22666586.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.5023058476295886, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.045669612009078264, + "learning_rate": 1.8642760616718086e-05, + "loss": 0.0018, + "num_tokens": 22677809.0, + "reward": 0.9130434393882751, + "reward_std": 0.3797464668750763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.03804347664117813, + "rewards/fixed_code_pass_all_test_reward/std": 0.0913117453455925, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 328.75, + "completions/mean_terminated_length": 328.75, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.5024903154399557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.017441502306610346, + "learning_rate": 1.8641140534656798e-05, + "loss": 0.0007, + "num_tokens": 22684783.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 182.625, + "completions/mean_terminated_length": 182.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.5026747832503228, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1064453125, + "kl": 0.06818964099511504, + "learning_rate": 1.863951955673953e-05, + "loss": 0.0027, + "num_tokens": 22693164.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 181.75, + "completions/mean_terminated_length": 181.75, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.50285925106069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.053072353824973106, + "learning_rate": 1.863789768313432e-05, + "loss": 0.0021, + "num_tokens": 22697522.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.503043718871057, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.07619306957349181, + "learning_rate": 1.8636274914009325e-05, + "loss": 0.003, + "num_tokens": 22703870.0, + "reward": 1.0535714626312256, + "reward_std": 0.0739356130361557, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0535714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.0739356055855751, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.5032281866814241, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06275647808797657, + "learning_rate": 1.8634651249532778e-05, + "loss": 0.0025, + "num_tokens": 22712718.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 325.875, + "completions/mean_terminated_length": 325.875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.5034126544917912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033203125, + "kl": 0.011306561005767435, + "learning_rate": 1.863302668987301e-05, + "loss": 0.0005, + "num_tokens": 22721429.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 228.875, + "completions/mean_terminated_length": 228.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.5035971223021583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.447265625, + "kl": 0.08814415335655212, + "learning_rate": 1.863140123519845e-05, + "loss": 0.0035, + "num_tokens": 22729812.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.5037815901125253, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.06425468460656703, + "learning_rate": 1.8629774885677604e-05, + "loss": 0.0026, + "num_tokens": 22737329.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 200.75, + "completions/mean_terminated_length": 200.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.5039660579228925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.049988374346867204, + "learning_rate": 1.8628147641479092e-05, + "loss": 0.002, + "num_tokens": 22744351.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 395.375, + "completions/mean_terminated_length": 395.375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.5041505257332596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.03743119747377932, + "learning_rate": 1.8626519502771606e-05, + "loss": 0.0015, + "num_tokens": 22752394.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 216.0, + "completions/mean_terminated_length": 216.0, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.5043349935436267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.04011024418286979, + "learning_rate": 1.862489046972395e-05, + "loss": 0.0016, + "num_tokens": 22756978.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 276.75, + "completions/mean_terminated_length": 276.75, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5045194613539937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.035857048351317644, + "learning_rate": 1.8623260542505005e-05, + "loss": 0.0014, + "num_tokens": 22763336.0, + "reward": 1.149999976158142, + "reward_std": 0.028284244239330292, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15000000596046448, + "rewards/fixed_code_pass_all_test_reward/std": 0.02828427031636238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 341.625, + "completions/mean_terminated_length": 341.625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.5047039291643608, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.05145525140687823, + "learning_rate": 1.8621629721283748e-05, + "loss": 0.0021, + "num_tokens": 22771085.0, + "reward": 1.671875, + "reward_std": 0.4565373361110687, + "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, + "rewards/fixed_code_pass_all_test_reward/std": 0.4565373361110687, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 283.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5048883969747279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.04882565513253212, + "learning_rate": 1.8619998006229262e-05, + "loss": 0.002, + "num_tokens": 22780205.0, + "reward": 1.3571429252624512, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 476.5, + "completions/mean_terminated_length": 476.5, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.505072864785095, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.051930279238149524, + "learning_rate": 1.8618365397510704e-05, + "loss": 0.0021, + "num_tokens": 22794657.0, + "reward": 1.6171875, + "reward_std": 0.22018027305603027, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6171875, + "rewards/fixed_code_pass_all_test_reward/std": 0.22018027305603027, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5052573325954621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.029758159071207047, + "learning_rate": 1.8616731895297338e-05, + "loss": 0.0012, + "num_tokens": 22803214.0, + "reward": 1.9791666269302368, + "reward_std": 0.058925606310367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.5054418004058292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.07370346807874739, + "learning_rate": 1.8615097499758508e-05, + "loss": 0.0029, + "num_tokens": 22810772.0, + "reward": 1.5472973585128784, + "reward_std": 0.19794942438602448, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5472972393035889, + "rewards/fixed_code_pass_all_test_reward/std": 0.19794940948486328, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5056262682161963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.0553805495146662, + "learning_rate": 1.8613462211063663e-05, + "loss": 0.0022, + "num_tokens": 22820550.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 241.125, + "completions/mean_terminated_length": 241.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.5058107360265633, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.05754216713830829, + "learning_rate": 1.8611826029382334e-05, + "loss": 0.0023, + "num_tokens": 22830087.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 151.0, + "completions/mean_terminated_length": 151.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.5059952038369304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.03508240572409704, + "learning_rate": 1.8610188954884152e-05, + "loss": 0.0014, + "num_tokens": 22834359.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 165.625, + "completions/mean_terminated_length": 165.625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5061796716472975, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.07310201367363334, + "learning_rate": 1.8608550987738838e-05, + "loss": 0.0029, + "num_tokens": 22842508.0, + "reward": 1.4117647409439087, + "reward_std": 0.4871050715446472, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4117647111415863, + "rewards/fixed_code_pass_all_test_reward/std": 0.4871051013469696, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 387.5, + "completions/mean_terminated_length": 387.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.5063641394576647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.05301804514601827, + "learning_rate": 1.860691212811621e-05, + "loss": 0.0021, + "num_tokens": 22852632.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 255.75, + "completions/mean_terminated_length": 255.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5065486072680317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66796875, + "kl": 0.048013882245868444, + "learning_rate": 1.8605272376186163e-05, + "loss": 0.0019, + "num_tokens": 22858558.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5067330750783988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.03405846678651869, + "learning_rate": 1.8603631732118705e-05, + "loss": 0.0014, + "num_tokens": 22864837.0, + "reward": 1.6666667461395264, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 133.25, + "completions/mean_terminated_length": 133.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5069175428887659, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.421875, + "kl": 0.1853981940075755, + "learning_rate": 1.8601990196083924e-05, + "loss": 0.0074, + "num_tokens": 22868735.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.507102010699133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10302734375, + "kl": 0.05060426262207329, + "learning_rate": 1.8600347768252006e-05, + "loss": 0.002, + "num_tokens": 22873155.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 133.375, + "completions/mean_terminated_length": 133.375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.5072864785095, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.640625, + "kl": 0.07958096358925104, + "learning_rate": 1.859870444879322e-05, + "loss": 0.0032, + "num_tokens": 22876990.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 545.125, + "completions/mean_terminated_length": 545.125, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.5074709463198672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.023994793533347547, + "learning_rate": 1.8597060237877944e-05, + "loss": 0.001, + "num_tokens": 22887279.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5076554141302343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.04975181689951569, + "learning_rate": 1.859541513567663e-05, + "loss": 0.002, + "num_tokens": 22894538.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 672.75, + "completions/mean_terminated_length": 672.75, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.5078398819406014, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6015625, + "kl": 0.041624399134889245, + "learning_rate": 1.8593769142359838e-05, + "loss": 0.0017, + "num_tokens": 22911776.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 256.625, + "completions/mean_terminated_length": 256.625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5080243497509684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.055431193206459284, + "learning_rate": 1.8592122258098208e-05, + "loss": 0.0022, + "num_tokens": 22922117.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 213.25, + "completions/mean_terminated_length": 213.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.5082088175613355, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.04109343606978655, + "learning_rate": 1.8590474483062483e-05, + "loss": 0.0016, + "num_tokens": 22929199.0, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 233.625, + "completions/mean_terminated_length": 233.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.5083932853717026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1962890625, + "kl": 0.05803778977133334, + "learning_rate": 1.858882581742349e-05, + "loss": 0.0023, + "num_tokens": 22939036.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 256.375, + "completions/mean_terminated_length": 256.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.5085777531820698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.03114568628370762, + "learning_rate": 1.8587176261352153e-05, + "loss": 0.0012, + "num_tokens": 22948959.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.5087622209924368, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8203125, + "kl": 0.06351406406611204, + "learning_rate": 1.8585525815019485e-05, + "loss": 0.0025, + "num_tokens": 22957596.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 405.875, + "completions/mean_terminated_length": 405.875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.5089466888028039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.03977635712362826, + "learning_rate": 1.85838744785966e-05, + "loss": 0.0016, + "num_tokens": 22970363.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 459.25, + "completions/mean_terminated_length": 459.25, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.509131156613171, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.03571559232659638, + "learning_rate": 1.8582222252254692e-05, + "loss": 0.0014, + "num_tokens": 22984437.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 281.0, + "completions/mean_terminated_length": 281.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5093156244235381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.06219602515920997, + "learning_rate": 1.858056913616505e-05, + "loss": 0.0025, + "num_tokens": 22991061.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 156.25, + "completions/mean_terminated_length": 156.25, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.5095000922339051, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.06495046429336071, + "learning_rate": 1.8578915130499063e-05, + "loss": 0.0026, + "num_tokens": 22995215.0, + "reward": 1.75, + "reward_std": 0.49601584672927856, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.24800792336463928, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 169.5, + "completions/mean_terminated_length": 169.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.5096845600442723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.058859196957200766, + "learning_rate": 1.8577260235428206e-05, + "loss": 0.0024, + "num_tokens": 23004027.0, + "reward": 1.101694941520691, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10169491171836853, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 235.625, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5098690278546394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.04777829488739371, + "learning_rate": 1.857560445112405e-05, + "loss": 0.0019, + "num_tokens": 23013112.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 297.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5100534956650065, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.04993998957797885, + "learning_rate": 1.857394777775825e-05, + "loss": 0.002, + "num_tokens": 23020074.0, + "reward": 1.9711538553237915, + "reward_std": 0.05341268330812454, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.05341270938515663, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 196.0, + "completions/mean_terminated_length": 196.0, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5102379634753735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.03727330290712416, + "learning_rate": 1.8572290215502567e-05, + "loss": 0.0015, + "num_tokens": 23028946.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 303.625, + "completions/mean_terminated_length": 303.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.5104224312857406, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.04818772594444454, + "learning_rate": 1.8570631764528838e-05, + "loss": 0.0019, + "num_tokens": 23034727.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 379.625, + "completions/mean_terminated_length": 379.625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.5106068990961077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.032342077465727925, + "learning_rate": 1.8568972425009e-05, + "loss": 0.0013, + "num_tokens": 23042140.0, + "reward": 1.920454502105713, + "reward_std": 0.22498849034309387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9204545617103577, + "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.5107913669064749, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.359375, + "kl": 0.047701354371383786, + "learning_rate": 1.856731219711509e-05, + "loss": 0.0019, + "num_tokens": 23048981.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5109758347168419, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.021630617673508823, + "learning_rate": 1.8565651081019223e-05, + "loss": 0.0009, + "num_tokens": 23054847.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 375.625, + "completions/mean_terminated_length": 375.625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.511160302527209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.030856761848554015, + "learning_rate": 1.8563989076893617e-05, + "loss": 0.0012, + "num_tokens": 23065700.0, + "reward": 1.5892857313156128, + "reward_std": 0.49744242429733276, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.49744245409965515, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 521.0, + "completions/mean_terminated_length": 521.0, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.5113447703375761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2470703125, + "kl": 0.038683447637595236, + "learning_rate": 1.856232618491057e-05, + "loss": 0.0015, + "num_tokens": 23080236.0, + "reward": 1.4615384340286255, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4615384638309479, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 336.125, + "completions/mean_terminated_length": 336.125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5115292381479432, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.039023472694680095, + "learning_rate": 1.856066240524249e-05, + "loss": 0.0016, + "num_tokens": 23086077.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 191.125, + "completions/mean_terminated_length": 191.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.5117137059583102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09130859375, + "kl": 0.04408479807898402, + "learning_rate": 1.855899773806186e-05, + "loss": 0.0018, + "num_tokens": 23091118.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5118981737686774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.048626302159391344, + "learning_rate": 1.8557332183541262e-05, + "loss": 0.0019, + "num_tokens": 23097162.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 457.125, + "completions/mean_terminated_length": 457.125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.5120826415790445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.022265153587795794, + "learning_rate": 1.855566574185337e-05, + "loss": 0.0009, + "num_tokens": 23106099.0, + "reward": 1.8571429252624512, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3499270975589752, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 239.375, + "completions/mean_terminated_length": 239.375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.5122671093894116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.026302648475393653, + "learning_rate": 1.855399841317095e-05, + "loss": 0.0011, + "num_tokens": 23111158.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 358.75, + "completions/mean_terminated_length": 358.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5124515771997786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.025299595901742578, + "learning_rate": 1.855233019766686e-05, + "loss": 0.001, + "num_tokens": 23118084.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.5126360450101457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.046765504870563745, + "learning_rate": 1.855066109551405e-05, + "loss": 0.0019, + "num_tokens": 23124785.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 325.125, + "completions/mean_terminated_length": 325.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5128205128205128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.044155846000649035, + "learning_rate": 1.854899110688556e-05, + "loss": 0.0018, + "num_tokens": 23135306.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 306.375, + "completions/mean_terminated_length": 306.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.51300498063088, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.05021716235205531, + "learning_rate": 1.8547320231954524e-05, + "loss": 0.002, + "num_tokens": 23145893.0, + "reward": 1.3977272510528564, + "reward_std": 0.2872230112552643, + "rewards/fixed_code_pass_all_test_reward/mean": 0.39772728085517883, + "rewards/fixed_code_pass_all_test_reward/std": 0.2872230112552643, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 278.875, + "completions/mean_terminated_length": 278.875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.513189448441247, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.07923578098416328, + "learning_rate": 1.8545648470894166e-05, + "loss": 0.0032, + "num_tokens": 23154612.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 347.375, + "completions/mean_terminated_length": 347.375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5133739162516141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.05200920789502561, + "learning_rate": 1.8543975823877803e-05, + "loss": 0.0021, + "num_tokens": 23161991.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 287.25, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.5135583840619812, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.05446542892605066, + "learning_rate": 1.8542302291078846e-05, + "loss": 0.0022, + "num_tokens": 23173809.0, + "reward": 1.7999999523162842, + "reward_std": 0.38544961810112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 244.75, + "completions/mean_terminated_length": 244.75, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5137428518723482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.07249522674828768, + "learning_rate": 1.8540627872670795e-05, + "loss": 0.0029, + "num_tokens": 23181271.0, + "reward": 1.6666667461395264, + "reward_std": 0.4714045226573944, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.4714045524597168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.5139273196827153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.044770811451599, + "learning_rate": 1.853895256882724e-05, + "loss": 0.0018, + "num_tokens": 23188322.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 238.25, + "completions/mean_terminated_length": 238.25, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5141117874930825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.0689567131921649, + "learning_rate": 1.8537276379721872e-05, + "loss": 0.0028, + "num_tokens": 23193108.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 199.375, + "completions/mean_terminated_length": 199.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.5142962553034496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.029760813689790666, + "learning_rate": 1.8535599305528463e-05, + "loss": 0.0012, + "num_tokens": 23198279.0, + "reward": 1.912500023841858, + "reward_std": 0.2474873811006546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9125000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2474873811006546, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 219.0, + "completions/mean_terminated_length": 219.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.5144807231138167, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.06344792759045959, + "learning_rate": 1.853392134642088e-05, + "loss": 0.0025, + "num_tokens": 23205383.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 196.5, + "completions/mean_terminated_length": 196.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.5146651909241837, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.08982344134710729, + "learning_rate": 1.8532242502573078e-05, + "loss": 0.0036, + "num_tokens": 23214059.0, + "reward": 1.8802082538604736, + "reward_std": 0.33882203698158264, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8802083134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.33882200717926025, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 354.25, + "completions/mean_terminated_length": 354.25, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.5148496587345508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.047635109163820744, + "learning_rate": 1.853056277415912e-05, + "loss": 0.0019, + "num_tokens": 23221813.0, + "reward": 1.625, + "reward_std": 0.4464142918586731, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4464142918586731, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.5150341265449179, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.07893895031884313, + "learning_rate": 1.8528882161353138e-05, + "loss": 0.0032, + "num_tokens": 23228644.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 323.125, + "completions/mean_terminated_length": 323.125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.515218594355285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.028219442581757903, + "learning_rate": 1.8527200664329378e-05, + "loss": 0.0011, + "num_tokens": 23238461.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 288.5, + "completions/mean_terminated_length": 288.5, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5154030621656521, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.027992335613816977, + "learning_rate": 1.8525518283262153e-05, + "loss": 0.0011, + "num_tokens": 23247577.0, + "reward": 1.9015151262283325, + "reward_std": 0.2785572111606598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9015151262283325, + "rewards/fixed_code_pass_all_test_reward/std": 0.2785572111606598, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5155875299760192, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.455078125, + "kl": 0.01824878208572045, + "learning_rate": 1.852383501832589e-05, + "loss": 0.0007, + "num_tokens": 23254824.0, + "reward": 1.984375, + "reward_std": 0.04419417306780815, + "rewards/fixed_code_pass_all_test_reward/mean": 0.984375, + "rewards/fixed_code_pass_all_test_reward/std": 0.04419417306780815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.5157719977863863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04242036887444556, + "learning_rate": 1.85221508696951e-05, + "loss": 0.0017, + "num_tokens": 23259169.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 792.0, + "completions/mean_terminated_length": 792.0, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "epoch": 0.5159564655967533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.765625, + "kl": 0.028247934067621827, + "learning_rate": 1.852046583754438e-05, + "loss": 0.0011, + "num_tokens": 23277897.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 352.625, + "completions/mean_terminated_length": 352.625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5161409334071204, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.045972500927746296, + "learning_rate": 1.8518779922048423e-05, + "loss": 0.0018, + "num_tokens": 23285414.0, + "reward": 1.7750000953674316, + "reward_std": 0.310529500246048, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 345.75, + "completions/mean_terminated_length": 345.75, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.5163254012174876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.03476559044793248, + "learning_rate": 1.8517093123382014e-05, + "loss": 0.0014, + "num_tokens": 23293332.0, + "reward": 1.0625, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 345.875, + "completions/mean_terminated_length": 345.875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.5165098690278547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.06239828537218273, + "learning_rate": 1.8515405441720027e-05, + "loss": 0.0025, + "num_tokens": 23305779.0, + "reward": 1.253151297569275, + "reward_std": 0.3803839087486267, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2531512677669525, + "rewards/fixed_code_pass_all_test_reward/std": 0.3803839087486267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 324.125, + "completions/mean_terminated_length": 324.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5166943368382217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.048747298191301525, + "learning_rate": 1.8513716877237436e-05, + "loss": 0.0019, + "num_tokens": 23313540.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.5168788046485888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.04933922551572323, + "learning_rate": 1.8512027430109296e-05, + "loss": 0.002, + "num_tokens": 23322841.0, + "reward": 1.807692289352417, + "reward_std": 0.3560846447944641, + "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.3560846745967865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 345.125, + "completions/mean_terminated_length": 345.125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5170632724589559, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.05617962358519435, + "learning_rate": 1.851033710051076e-05, + "loss": 0.0022, + "num_tokens": 23334530.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.517247740269323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.022077973233535886, + "learning_rate": 1.8508645888617065e-05, + "loss": 0.0009, + "num_tokens": 23339820.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 558.125, + "completions/mean_terminated_length": 558.125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.5174322080796901, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7265625, + "kl": 0.028335342882201076, + "learning_rate": 1.8506953794603548e-05, + "loss": 0.0011, + "num_tokens": 23349805.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 273.875, + "completions/mean_terminated_length": 273.875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.5176166758900572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.05826432514004409, + "learning_rate": 1.8505260818645635e-05, + "loss": 0.0023, + "num_tokens": 23356140.0, + "reward": 1.5700757503509521, + "reward_std": 0.6344894766807556, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6950757503509521, + "rewards/fixed_code_pass_all_test_reward/std": 0.28104063868522644, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 289.625, + "completions/mean_terminated_length": 289.625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5178011437004243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.04653330682776868, + "learning_rate": 1.8503566960918838e-05, + "loss": 0.0019, + "num_tokens": 23364833.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 305.875, + "completions/mean_terminated_length": 305.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5179856115107914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.189453125, + "kl": 0.07116739777848125, + "learning_rate": 1.8501872221598774e-05, + "loss": 0.0028, + "num_tokens": 23374248.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 351.75, + "completions/mean_terminated_length": 351.75, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.5181700793211584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.04943715361878276, + "learning_rate": 1.850017660086113e-05, + "loss": 0.002, + "num_tokens": 23383014.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 400.625, + "completions/mean_terminated_length": 400.625, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.5183545471315255, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7578125, + "kl": 0.027977523859590292, + "learning_rate": 1.849848009888171e-05, + "loss": 0.0011, + "num_tokens": 23391331.0, + "reward": 1.7374999523162842, + "reward_std": 0.36228445172309875, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7374999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.36228442192077637, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 260.75, + "completions/mean_terminated_length": 260.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5185390149418926, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24609375, + "kl": 0.08416365180164576, + "learning_rate": 1.8496782715836386e-05, + "loss": 0.0034, + "num_tokens": 23396329.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5187234827522598, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.04282577778212726, + "learning_rate": 1.8495084451901135e-05, + "loss": 0.0017, + "num_tokens": 23406719.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 279.875, + "completions/mean_terminated_length": 279.875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.5189079505626268, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.04288282827474177, + "learning_rate": 1.849338530725202e-05, + "loss": 0.0017, + "num_tokens": 23429734.0, + "reward": 1.9337348937988281, + "reward_std": 0.12269902974367142, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9337349534034729, + "rewards/fixed_code_pass_all_test_reward/std": 0.12269905209541321, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 170.75, + "completions/mean_terminated_length": 170.75, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.5190924183729939, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.05002048425376415, + "learning_rate": 1.8491685282065197e-05, + "loss": 0.002, + "num_tokens": 23433884.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 230.875, + "completions/mean_terminated_length": 230.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.519276886183361, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.15176389645785093, + "learning_rate": 1.848998437651692e-05, + "loss": 0.0061, + "num_tokens": 23439579.0, + "reward": 1.421875, + "reward_std": 0.48152241110801697, + "rewards/fixed_code_pass_all_test_reward/mean": 0.421875, + "rewards/fixed_code_pass_all_test_reward/std": 0.48152244091033936, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 309.125, + "completions/mean_terminated_length": 309.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5194613539937281, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.04074770538136363, + "learning_rate": 1.8488282590783517e-05, + "loss": 0.0016, + "num_tokens": 23446172.0, + "reward": 1.808333396911621, + "reward_std": 0.2653239965438843, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8083333373069763, + "rewards/fixed_code_pass_all_test_reward/std": 0.2653239965438843, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.5196458218040951, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.057164270896464586, + "learning_rate": 1.8486579925041425e-05, + "loss": 0.0023, + "num_tokens": 23450246.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5198302896144623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.059136889642104506, + "learning_rate": 1.848487637946716e-05, + "loss": 0.0024, + "num_tokens": 23458752.0, + "reward": 1.3289473056793213, + "reward_std": 0.2723943293094635, + "rewards/fixed_code_pass_all_test_reward/mean": 0.32894736528396606, + "rewards/fixed_code_pass_all_test_reward/std": 0.2723943293094635, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5200147574248294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.04421363747678697, + "learning_rate": 1.8483171954237344e-05, + "loss": 0.0018, + "num_tokens": 23463831.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5201992252351965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.043971346342004836, + "learning_rate": 1.848146664952867e-05, + "loss": 0.0018, + "num_tokens": 23471833.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 283.25, + "completions/mean_terminated_length": 283.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5203836930455635, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.05586092174053192, + "learning_rate": 1.8479760465517933e-05, + "loss": 0.0022, + "num_tokens": 23480995.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5205681608559306, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3359375, + "kl": 0.0726253513712436, + "learning_rate": 1.847805340238203e-05, + "loss": 0.0029, + "num_tokens": 23491478.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 341.75, + "completions/mean_terminated_length": 341.75, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.5207526286662977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.0654238909482956, + "learning_rate": 1.8476345460297925e-05, + "loss": 0.0026, + "num_tokens": 23498612.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5209370964766649, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.03805620735511184, + "learning_rate": 1.847463663944269e-05, + "loss": 0.0015, + "num_tokens": 23505530.0, + "reward": 1.8973214626312256, + "reward_std": 0.2904188334941864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8973214626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.2904188334941864, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 355.25, + "completions/mean_terminated_length": 355.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.5211215642870319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.052640109322965145, + "learning_rate": 1.8472926939993487e-05, + "loss": 0.0021, + "num_tokens": 23514020.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 248.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.521306032097399, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03493537427857518, + "learning_rate": 1.847121636212757e-05, + "loss": 0.0014, + "num_tokens": 23522536.0, + "reward": 1.2992424964904785, + "reward_std": 0.29130449891090393, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29924243688583374, + "rewards/fixed_code_pass_all_test_reward/std": 0.2913045287132263, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 501.75, + "completions/mean_terminated_length": 501.75, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.5214904999077661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.02579028159379959, + "learning_rate": 1.8469504906022267e-05, + "loss": 0.001, + "num_tokens": 23532470.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 221.625, + "completions/mean_terminated_length": 221.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.5216749677181332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10107421875, + "kl": 0.06299476441927254, + "learning_rate": 1.8467792571855027e-05, + "loss": 0.0025, + "num_tokens": 23537995.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 138.0, + "completions/mean_terminated_length": 138.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.5218594355285002, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.053371525602415204, + "learning_rate": 1.846607935980336e-05, + "loss": 0.0021, + "num_tokens": 23542067.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 258.75, + "completions/mean_terminated_length": 258.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5220439033388674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10791015625, + "kl": 0.04979613143950701, + "learning_rate": 1.8464365270044884e-05, + "loss": 0.002, + "num_tokens": 23550193.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5222283711492345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.050818806514143944, + "learning_rate": 1.846265030275731e-05, + "loss": 0.002, + "num_tokens": 23560689.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 306.5, + "completions/mean_terminated_length": 306.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5224128389596016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.04412054060958326, + "learning_rate": 1.8460934458118425e-05, + "loss": 0.0018, + "num_tokens": 23568941.0, + "reward": 1.8461538553237915, + "reward_std": 0.3513047993183136, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8461538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.3513047993183136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 346.5, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.5225973067699686, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.04565321700647473, + "learning_rate": 1.8459217736306125e-05, + "loss": 0.0018, + "num_tokens": 23576145.0, + "reward": 1.076923131942749, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07692307978868484, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5227817745803357, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.03350291715469211, + "learning_rate": 1.8457500137498385e-05, + "loss": 0.0013, + "num_tokens": 23581066.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5229662423907028, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.04699177551083267, + "learning_rate": 1.8455781661873268e-05, + "loss": 0.0019, + "num_tokens": 23585583.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 285.125, + "completions/mean_terminated_length": 285.125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.52315071020107, + "frac_reward_zero_std": 0.0, + "grad_norm": 12.9375, + "kl": 0.09024004125967622, + "learning_rate": 1.8454062309608946e-05, + "loss": 0.0036, + "num_tokens": 23591048.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.523335178011437, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1904296875, + "kl": 0.0644664082210511, + "learning_rate": 1.8452342080883665e-05, + "loss": 0.0026, + "num_tokens": 23595376.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 237.375, + "completions/mean_terminated_length": 237.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5235196458218041, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.06963581638410687, + "learning_rate": 1.8450620975875763e-05, + "loss": 0.0028, + "num_tokens": 23602035.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 250.875, + "completions/mean_terminated_length": 250.875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.5237041136321712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.03225951734930277, + "learning_rate": 1.844889899476368e-05, + "loss": 0.0013, + "num_tokens": 23609778.0, + "reward": 1.345588207244873, + "reward_std": 0.26760777831077576, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34558823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.26760780811309814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5238885814425382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.028353850822895765, + "learning_rate": 1.8447176137725934e-05, + "loss": 0.0011, + "num_tokens": 23616226.0, + "reward": 1.6022727489471436, + "reward_std": 0.09642365574836731, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6022727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 317.5, + "completions/mean_terminated_length": 317.5, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.5240730492529053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.07353658927604556, + "learning_rate": 1.8445452404941136e-05, + "loss": 0.0029, + "num_tokens": 23627558.0, + "reward": 1.875, + "reward_std": 0.2749859690666199, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2749859690666199, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5242575170632725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.04875091719441116, + "learning_rate": 1.8443727796588003e-05, + "loss": 0.002, + "num_tokens": 23637089.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 169.125, + "completions/mean_terminated_length": 169.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.5244419848736396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.13551864679902792, + "learning_rate": 1.844200231284532e-05, + "loss": 0.0054, + "num_tokens": 23645370.0, + "reward": 1.893617033958435, + "reward_std": 0.3008964955806732, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8936170339584351, + "rewards/fixed_code_pass_all_test_reward/std": 0.3008965253829956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5246264526840066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15625, + "kl": 0.06745268451049924, + "learning_rate": 1.8440275953891976e-05, + "loss": 0.0027, + "num_tokens": 23649833.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 126.0, + "completions/mean_terminated_length": 126.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.5248109204943737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.06305395509116352, + "learning_rate": 1.8438548719906954e-05, + "loss": 0.0025, + "num_tokens": 23653689.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.5249953883047408, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.03375433851033449, + "learning_rate": 1.8436820611069315e-05, + "loss": 0.0014, + "num_tokens": 23657521.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.5251798561151079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.022012108704075217, + "learning_rate": 1.843509162755822e-05, + "loss": 0.0009, + "num_tokens": 23662250.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 154.0, + "completions/mean_terminated_length": 154.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.525364323925475, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.04795918520539999, + "learning_rate": 1.843336176955292e-05, + "loss": 0.0019, + "num_tokens": 23668834.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 109.25, + "completions/mean_terminated_length": 109.25, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.5255487917358421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.04178744845557958, + "learning_rate": 1.8431631037232756e-05, + "loss": 0.0017, + "num_tokens": 23672492.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 314.625, + "completions/mean_terminated_length": 314.625, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.5257332595462092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.07368254032917321, + "learning_rate": 1.8429899430777157e-05, + "loss": 0.0029, + "num_tokens": 23679425.0, + "reward": 1.8142857551574707, + "reward_std": 0.28448712825775146, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8142856955528259, + "rewards/fixed_code_pass_all_test_reward/std": 0.28448715806007385, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 316.625, + "completions/mean_terminated_length": 316.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5259177273565763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.06328038731589913, + "learning_rate": 1.8428166950365645e-05, + "loss": 0.0025, + "num_tokens": 23690078.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5261021951669433, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.021372593473643064, + "learning_rate": 1.8426433596177832e-05, + "loss": 0.0009, + "num_tokens": 23700256.0, + "reward": 1.9680118560791016, + "reward_std": 0.02648874931037426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9680118560791016, + "rewards/fixed_code_pass_all_test_reward/std": 0.02648872509598732, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 249.875, + "completions/mean_terminated_length": 249.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5262866629773104, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.049210532335564494, + "learning_rate": 1.8424699368393423e-05, + "loss": 0.002, + "num_tokens": 23711303.0, + "reward": 1.807692289352417, + "reward_std": 0.3560846447944641, + "rewards/fixed_code_pass_all_test_reward/mean": 0.807692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.3560846745967865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.5264711307876776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.033354627434164286, + "learning_rate": 1.8422964267192204e-05, + "loss": 0.0013, + "num_tokens": 23722093.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5266555985980447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.03479305584914982, + "learning_rate": 1.8421228292754064e-05, + "loss": 0.0014, + "num_tokens": 23728975.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 490.75, + "completions/mean_terminated_length": 490.75, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.5268400664084117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.03214071015827358, + "learning_rate": 1.8419491445258977e-05, + "loss": 0.0013, + "num_tokens": 23741989.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 251.5, + "completions/mean_terminated_length": 251.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.5270245342187788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.02958951867185533, + "learning_rate": 1.8417753724887008e-05, + "loss": 0.0012, + "num_tokens": 23750457.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 134.625, + "completions/mean_terminated_length": 134.625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5272090020291459, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.050456660450436175, + "learning_rate": 1.8416015131818312e-05, + "loss": 0.002, + "num_tokens": 23754486.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.527393469839513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8125, + "kl": 0.08452646480873227, + "learning_rate": 1.8414275666233137e-05, + "loss": 0.0034, + "num_tokens": 23761658.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 326.75, + "completions/mean_terminated_length": 326.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5275779376498801, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.08769943751394749, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.0035, + "num_tokens": 23771096.0, + "reward": 1.9838709831237793, + "reward_std": 0.029865136370062828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9838709831237793, + "rewards/fixed_code_pass_all_test_reward/std": 0.029865162447094917, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 377.75, + "completions/mean_terminated_length": 377.75, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.5277624054602472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.02376266405917704, + "learning_rate": 1.841079411823477e-05, + "loss": 0.001, + "num_tokens": 23777838.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 421.75, + "completions/mean_terminated_length": 421.75, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.5279468732706143, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69921875, + "kl": 0.023376462049782276, + "learning_rate": 1.840905203618253e-05, + "loss": 0.0009, + "num_tokens": 23790188.0, + "reward": 1.9107142686843872, + "reward_std": 0.25253817439079285, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.5281313410809814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04476763564161956, + "learning_rate": 1.8407309082335692e-05, + "loss": 0.0018, + "num_tokens": 23799877.0, + "reward": 1.4319853782653809, + "reward_std": 0.7195343375205994, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5569853186607361, + "rewards/fixed_code_pass_all_test_reward/std": 0.48331257700920105, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 356.125, + "completions/mean_terminated_length": 356.125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.5283158088913484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04008093476295471, + "learning_rate": 1.8405565256874962e-05, + "loss": 0.0016, + "num_tokens": 23811758.0, + "reward": 1.0457316637039185, + "reward_std": 0.027463216334581375, + "rewards/fixed_code_pass_all_test_reward/mean": 0.04573170840740204, + "rewards/fixed_code_pass_all_test_reward/std": 0.027463210746645927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 312.375, + "completions/mean_terminated_length": 312.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.5285002767017155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.03383087646216154, + "learning_rate": 1.840382055998112e-05, + "loss": 0.0014, + "num_tokens": 23821673.0, + "reward": 1.8617424964904785, + "reward_std": 0.24667370319366455, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8617424368858337, + "rewards/fixed_code_pass_all_test_reward/std": 0.24667373299598694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 312.375, + "completions/mean_terminated_length": 312.375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.5286847445120827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.03994820243678987, + "learning_rate": 1.8402074991835052e-05, + "loss": 0.0016, + "num_tokens": 23831156.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 365.875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5288692123224498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03670158772729337, + "learning_rate": 1.8400328552617723e-05, + "loss": 0.0015, + "num_tokens": 23838787.0, + "reward": 1.6453489065170288, + "reward_std": 0.19879494607448578, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6453487873077393, + "rewards/fixed_code_pass_all_test_reward/std": 0.19879494607448578, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 431.25, + "completions/mean_terminated_length": 431.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5290536801328168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.08484932337887585, + "learning_rate": 1.839858124251019e-05, + "loss": 0.0034, + "num_tokens": 23849941.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 200.625, + "completions/mean_terminated_length": 200.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5292381479431839, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.04827801161445677, + "learning_rate": 1.8396833061693607e-05, + "loss": 0.0019, + "num_tokens": 23855370.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 234.625, + "completions/mean_terminated_length": 234.625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.529422615753551, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08056640625, + "kl": 0.038210643688216805, + "learning_rate": 1.8395084010349214e-05, + "loss": 0.0015, + "num_tokens": 23860295.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.5296070835639181, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.06456323666498065, + "learning_rate": 1.8393334088658337e-05, + "loss": 0.0026, + "num_tokens": 23868979.0, + "reward": 1.7083332538604736, + "reward_std": 0.360958456993103, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.360958456993103, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.5297915513742852, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.06536012096330523, + "learning_rate": 1.8391583296802397e-05, + "loss": 0.0026, + "num_tokens": 23873900.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 353.0, + "completions/mean_terminated_length": 353.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.5299760191846523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.03246226394549012, + "learning_rate": 1.8389831634962907e-05, + "loss": 0.0013, + "num_tokens": 23880996.0, + "reward": 1.8235294818878174, + "reward_std": 0.11336753517389297, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8235294222831726, + "rewards/fixed_code_pass_all_test_reward/std": 0.11336753517389297, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 219.375, + "completions/mean_terminated_length": 219.375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5301604869950194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.037451985059306026, + "learning_rate": 1.838807910332147e-05, + "loss": 0.0015, + "num_tokens": 23885855.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 209.125, + "completions/mean_terminated_length": 209.125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5303449548053865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.06036352412775159, + "learning_rate": 1.8386325702059767e-05, + "loss": 0.0024, + "num_tokens": 23890376.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 545.625, + "completions/mean_terminated_length": 545.625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.5305294226157535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6875, + "kl": 0.020776367862708867, + "learning_rate": 1.838457143135959e-05, + "loss": 0.0008, + "num_tokens": 23899645.0, + "reward": 1.9285714626312256, + "reward_std": 0.13225999474525452, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9285714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.1322600096464157, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5307138904261206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.06739483680576086, + "learning_rate": 1.8382816291402803e-05, + "loss": 0.0027, + "num_tokens": 23908711.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 416.75, + "completions/mean_terminated_length": 416.75, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5308983582364877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.06516665406525135, + "learning_rate": 1.838106028237137e-05, + "loss": 0.0026, + "num_tokens": 23919445.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 335.375, + "completions/mean_terminated_length": 335.375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5310828260468549, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.061262635281309485, + "learning_rate": 1.8379303404447343e-05, + "loss": 0.0025, + "num_tokens": 23929648.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 249.0, + "completions/mean_terminated_length": 249.0, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.5312672938572219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.05810755817219615, + "learning_rate": 1.837754565781286e-05, + "loss": 0.0023, + "num_tokens": 23935080.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.531451761667589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052490234375, + "kl": 0.02686466625891626, + "learning_rate": 1.8375787042650154e-05, + "loss": 0.0011, + "num_tokens": 23940249.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 234.875, + "completions/mean_terminated_length": 234.875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.5316362294779561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.032278833677992225, + "learning_rate": 1.837402755914155e-05, + "loss": 0.0013, + "num_tokens": 23945192.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 526.125, + "completions/mean_terminated_length": 526.125, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.5318206972883232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66015625, + "kl": 0.02161016664467752, + "learning_rate": 1.8372267207469458e-05, + "loss": 0.0009, + "num_tokens": 23955473.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 435.375, + "completions/mean_terminated_length": 435.375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.5320051650986902, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.04296040034387261, + "learning_rate": 1.837050598781638e-05, + "loss": 0.0017, + "num_tokens": 23964236.0, + "reward": 1.892045497894287, + "reward_std": 0.2874155640602112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8920454978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.28741559386253357, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 423.25, + "completions/mean_terminated_length": 191.1428680419922, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5321896329090574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.51171875, + "kl": 0.0417083139764145, + "learning_rate": 1.83687439003649e-05, + "loss": 0.0017, + "num_tokens": 23974910.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 329.875, + "completions/mean_terminated_length": 329.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5323741007194245, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.07393796229735017, + "learning_rate": 1.836698094529771e-05, + "loss": 0.003, + "num_tokens": 23981957.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5325585685297916, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.05093437968753278, + "learning_rate": 1.8365217122797573e-05, + "loss": 0.002, + "num_tokens": 23989329.0, + "reward": 1.7272727489471436, + "reward_std": 0.16833092272281647, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 330.25, + "completions/mean_terminated_length": 330.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5327430363401586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.0502249994315207, + "learning_rate": 1.8363452433047356e-05, + "loss": 0.002, + "num_tokens": 23996339.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 339.875, + "completions/mean_terminated_length": 339.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.5329275041505257, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.05096303578466177, + "learning_rate": 1.8361686876230013e-05, + "loss": 0.002, + "num_tokens": 24006506.0, + "reward": 1.0416667461395264, + "reward_std": 0.11785116046667099, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, + "rewards/fixed_code_pass_all_test_reward/std": 0.1178511455655098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 380.75, + "completions/mean_terminated_length": 380.75, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.5331119719608928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.045845608692616224, + "learning_rate": 1.8359920452528577e-05, + "loss": 0.0018, + "num_tokens": 24016800.0, + "reward": 1.8948863744735718, + "reward_std": 0.2820216715335846, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8948863744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.282021701335907, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.53329643977126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2236328125, + "kl": 0.07154117804020643, + "learning_rate": 1.8358153162126183e-05, + "loss": 0.0029, + "num_tokens": 24022093.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 424.625, + "completions/mean_terminated_length": 424.625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.533480907581627, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.03469070536084473, + "learning_rate": 1.835638500520605e-05, + "loss": 0.0014, + "num_tokens": 24030490.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 285.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5336653753919941, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.08241589739918709, + "learning_rate": 1.8354615981951492e-05, + "loss": 0.0033, + "num_tokens": 24035847.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5338498432023612, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.0436588975135237, + "learning_rate": 1.835284609254591e-05, + "loss": 0.0017, + "num_tokens": 24041836.0, + "reward": 1.09375, + "reward_std": 0.1735912710428238, + "rewards/fixed_code_pass_all_test_reward/mean": 0.09375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1735912710428238, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 378.25, + "completions/mean_terminated_length": 378.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5340343110127282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.031775604002177715, + "learning_rate": 1.835107533717279e-05, + "loss": 0.0013, + "num_tokens": 24052846.0, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 119.75, + "completions/mean_terminated_length": 119.75, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.5342187788230953, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.08700443943962455, + "learning_rate": 1.834930371601572e-05, + "loss": 0.0035, + "num_tokens": 24056668.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 416.875, + "completions/mean_terminated_length": 416.875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.5344032466334625, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.203125, + "kl": 0.1006898726336658, + "learning_rate": 1.8347531229258356e-05, + "loss": 0.004, + "num_tokens": 24066675.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 325.0, + "completions/mean_terminated_length": 325.0, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5345877144438296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.07155615463852882, + "learning_rate": 1.8345757877084472e-05, + "loss": 0.0029, + "num_tokens": 24076123.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 233.875, + "completions/mean_terminated_length": 233.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.5347721822541966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.02944751491304487, + "learning_rate": 1.8343983659677912e-05, + "loss": 0.0012, + "num_tokens": 24081730.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 269.125, + "completions/mean_terminated_length": 269.125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5349566500645637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.06105159432627261, + "learning_rate": 1.834220857722261e-05, + "loss": 0.0024, + "num_tokens": 24090595.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 160.0, + "completions/max_terminated_length": 160.0, + "completions/mean_length": 147.0, + "completions/mean_terminated_length": 147.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.5351411178749308, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.04223503312096, + "learning_rate": 1.8340432629902603e-05, + "loss": 0.0017, + "num_tokens": 24094483.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 638.625, + "completions/mean_terminated_length": 638.625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5353255856852979, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.05114214518107474, + "learning_rate": 1.8338655817902005e-05, + "loss": 0.002, + "num_tokens": 24108736.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 413.625, + "completions/mean_terminated_length": 413.625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.535510053495665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.07517574448138475, + "learning_rate": 1.8336878141405026e-05, + "loss": 0.003, + "num_tokens": 24118877.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 322.625, + "completions/mean_terminated_length": 322.625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5356945213060321, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.04034720407798886, + "learning_rate": 1.833509960059596e-05, + "loss": 0.0016, + "num_tokens": 24125490.0, + "reward": 1.4583332538604736, + "reward_std": 0.44854259490966797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4583333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.44854262471199036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 288.375, + "completions/mean_terminated_length": 288.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5358789891163992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.02978092583362013, + "learning_rate": 1.8333320195659197e-05, + "loss": 0.0012, + "num_tokens": 24132221.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 384.25, + "completions/mean_terminated_length": 384.25, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.5360634569267663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.06300222594290972, + "learning_rate": 1.8331539926779214e-05, + "loss": 0.0025, + "num_tokens": 24141807.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 283.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.5362479247371333, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.03432313329540193, + "learning_rate": 1.832975879414058e-05, + "loss": 0.0014, + "num_tokens": 24151252.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 419.625, + "completions/mean_terminated_length": 419.625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5364323925475004, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.03202848241198808, + "learning_rate": 1.8327976797927946e-05, + "loss": 0.0013, + "num_tokens": 24160617.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 279.625, + "completions/mean_terminated_length": 279.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.5366168603578676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.09069602843374014, + "learning_rate": 1.832619393832606e-05, + "loss": 0.0036, + "num_tokens": 24168982.0, + "reward": 1.875, + "reward_std": 0.02022511698305607, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.02022511698305607, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 532.75, + "completions/mean_terminated_length": 532.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.5368013281682347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.04365922592114657, + "learning_rate": 1.8324410215519755e-05, + "loss": 0.0017, + "num_tokens": 24184460.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 646.625, + "completions/mean_terminated_length": 646.625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "epoch": 0.5369857959786017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58203125, + "kl": 0.0212044978979975, + "learning_rate": 1.8322625629693957e-05, + "loss": 0.0008, + "num_tokens": 24198761.0, + "reward": 1.4583333730697632, + "reward_std": 0.501980185508728, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 486.25, + "completions/mean_terminated_length": 486.25, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.5371702637889688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.03286727063823491, + "learning_rate": 1.832084018103368e-05, + "loss": 0.0013, + "num_tokens": 24211291.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 243.5, + "completions/mean_terminated_length": 243.5, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.5373547315993359, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.059785882011055946, + "learning_rate": 1.8319053869724032e-05, + "loss": 0.0024, + "num_tokens": 24216351.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 223.125, + "completions/mean_terminated_length": 223.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.537539199409703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.030113519955193624, + "learning_rate": 1.8317266695950196e-05, + "loss": 0.0012, + "num_tokens": 24221560.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 539.625, + "completions/mean_terminated_length": 539.625, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.5377236672200701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.042458487674593925, + "learning_rate": 1.8315478659897464e-05, + "loss": 0.0017, + "num_tokens": 24231309.0, + "reward": 1.1666667461395264, + "reward_std": 0.10286887735128403, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.10286889970302582, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 489.875, + "completions/mean_terminated_length": 489.875, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.5379081350304372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.048516184790059924, + "learning_rate": 1.8313689761751197e-05, + "loss": 0.0019, + "num_tokens": 24241692.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 415.125, + "completions/mean_terminated_length": 415.125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.5380926028408043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.031217518961057067, + "learning_rate": 1.831190000169687e-05, + "loss": 0.0012, + "num_tokens": 24251709.0, + "reward": 1.942307710647583, + "reward_std": 0.1631784588098526, + "rewards/fixed_code_pass_all_test_reward/mean": 0.942307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.1631784737110138, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 493.375, + "completions/mean_terminated_length": 493.375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.5382770706511714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0719007640145719, + "learning_rate": 1.831010937992002e-05, + "loss": 0.0029, + "num_tokens": 24260248.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 350.375, + "completions/mean_terminated_length": 350.375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.5384615384615384, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.05667279870249331, + "learning_rate": 1.8308317896606298e-05, + "loss": 0.0023, + "num_tokens": 24268795.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5386460062719055, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.04545099101960659, + "learning_rate": 1.8306525551941424e-05, + "loss": 0.0018, + "num_tokens": 24275893.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 315.0, + "completions/mean_terminated_length": 315.0, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5388304740822727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.04677803046070039, + "learning_rate": 1.8304732346111224e-05, + "loss": 0.0019, + "num_tokens": 24284413.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 402.5, + "completions/mean_terminated_length": 402.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.5390149418926398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.030509008560329676, + "learning_rate": 1.8302938279301597e-05, + "loss": 0.0012, + "num_tokens": 24293833.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 334.0, + "completions/mean_terminated_length": 334.0, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.5391994097030068, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.05443519772961736, + "learning_rate": 1.8301143351698547e-05, + "loss": 0.0022, + "num_tokens": 24301057.0, + "reward": 1.6304347515106201, + "reward_std": 0.33597835898399353, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6304348111152649, + "rewards/fixed_code_pass_all_test_reward/std": 0.33597832918167114, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 439.5, + "completions/mean_terminated_length": 439.5, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5393838775133739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.04643758945167065, + "learning_rate": 1.8299347563488158e-05, + "loss": 0.0019, + "num_tokens": 24312133.0, + "reward": 0.9516129493713379, + "reward_std": 0.5123711228370667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20161288976669312, + "rewards/fixed_code_pass_all_test_reward/std": 0.08224225044250488, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.539568345323741, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.02165124472230673, + "learning_rate": 1.8297550914856602e-05, + "loss": 0.0009, + "num_tokens": 24316589.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 324.125, + "completions/mean_terminated_length": 324.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.5397528131341081, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.043638721108436584, + "learning_rate": 1.8295753405990152e-05, + "loss": 0.0017, + "num_tokens": 24326086.0, + "reward": 1.7236841917037964, + "reward_std": 0.4282745122909546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7236841917037964, + "rewards/fixed_code_pass_all_test_reward/std": 0.428274542093277, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 427.625, + "completions/mean_terminated_length": 427.625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5399372809444752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.04224176541902125, + "learning_rate": 1.8293955037075152e-05, + "loss": 0.0017, + "num_tokens": 24337275.0, + "reward": 1.03125, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.03125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 248.75, + "completions/mean_terminated_length": 248.75, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5401217487548423, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.0699792105006054, + "learning_rate": 1.8292155808298054e-05, + "loss": 0.0028, + "num_tokens": 24342729.0, + "reward": 0.9375, + "reward_std": 0.33407655358314514, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0625, + "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 288.0, + "completions/mean_terminated_length": 288.0, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.5403062165652094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.09865896543487906, + "learning_rate": 1.8290355719845384e-05, + "loss": 0.0039, + "num_tokens": 24349033.0, + "reward": 1.4375, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 397.75, + "completions/mean_terminated_length": 397.75, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.5404906843755765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.04213524959050119, + "learning_rate": 1.828855477190376e-05, + "loss": 0.0017, + "num_tokens": 24356799.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 492.375, + "completions/mean_terminated_length": 492.375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.5406751521859435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.02584608062170446, + "learning_rate": 1.8286752964659902e-05, + "loss": 0.001, + "num_tokens": 24366314.0, + "reward": 1.7708333730697632, + "reward_std": 0.09708036482334137, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.09708039462566376, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5408596199963106, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.06530005624517798, + "learning_rate": 1.8284950298300605e-05, + "loss": 0.0026, + "num_tokens": 24376050.0, + "reward": 1.796875, + "reward_std": 0.3892385959625244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 507.625, + "completions/mean_terminated_length": 507.625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.5410440878066778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.030729789868928492, + "learning_rate": 1.8283146773012754e-05, + "loss": 0.0012, + "num_tokens": 24387287.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 130.75, + "completions/mean_terminated_length": 130.75, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.5412285556170449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.03776670293882489, + "learning_rate": 1.8281342388983332e-05, + "loss": 0.0015, + "num_tokens": 24391045.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 283.375, + "completions/mean_terminated_length": 283.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5414130234274119, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03434872371144593, + "learning_rate": 1.82795371463994e-05, + "loss": 0.0014, + "num_tokens": 24397520.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 449.5, + "completions/mean_terminated_length": 449.5, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.541597491237779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.034962830948643386, + "learning_rate": 1.827773104544812e-05, + "loss": 0.0014, + "num_tokens": 24406308.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 295.125, + "completions/mean_terminated_length": 295.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5417819590481461, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.0374046522192657, + "learning_rate": 1.827592408631673e-05, + "loss": 0.0015, + "num_tokens": 24412549.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 924.75, + "completions/mean_terminated_length": 924.75, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "epoch": 0.5419664268585132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.578125, + "kl": 0.015889783506281674, + "learning_rate": 1.827411626919257e-05, + "loss": 0.0006, + "num_tokens": 24430483.0, + "reward": 1.7708332538604736, + "reward_std": 0.17677666246891022, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7708333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766773700714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 263.875, + "completions/mean_terminated_length": 263.875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.5421508946688803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.890625, + "kl": 0.07926293043419719, + "learning_rate": 1.8272307594263056e-05, + "loss": 0.0032, + "num_tokens": 24436546.0, + "reward": 1.8250000476837158, + "reward_std": 0.36154431104660034, + "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 595.875, + "completions/mean_terminated_length": 595.875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.5423353624792474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58203125, + "kl": 0.018995413440279663, + "learning_rate": 1.8270498061715703e-05, + "loss": 0.0008, + "num_tokens": 24445945.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 440.75, + "completions/mean_terminated_length": 440.75, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.5425198302896145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.03580684750340879, + "learning_rate": 1.826868767173811e-05, + "loss": 0.0014, + "num_tokens": 24453727.0, + "reward": 1.6527776718139648, + "reward_std": 0.20945467054843903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6527777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.20945467054843903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 441.875, + "completions/mean_terminated_length": 441.875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.5427042980999816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.033507868414744735, + "learning_rate": 1.826687642451797e-05, + "loss": 0.0013, + "num_tokens": 24464790.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 494.5, + "completions/mean_terminated_length": 494.5, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.5428887659103486, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.050762703409418464, + "learning_rate": 1.8265064320243056e-05, + "loss": 0.002, + "num_tokens": 24476642.0, + "reward": 1.52173912525177, + "reward_std": 0.22411949932575226, + "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, + "rewards/fixed_code_pass_all_test_reward/std": 0.22411948442459106, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 621.75, + "completions/mean_terminated_length": 621.75, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "epoch": 0.5430732337207157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.05861212476156652, + "learning_rate": 1.826325135910124e-05, + "loss": 0.0023, + "num_tokens": 24491360.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.5432577015310828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.05620584962889552, + "learning_rate": 1.826143754128047e-05, + "loss": 0.0022, + "num_tokens": 24499376.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 727.625, + "completions/mean_terminated_length": 727.625, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.54344216934145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.03077749314252287, + "learning_rate": 1.8259622866968793e-05, + "loss": 0.0012, + "num_tokens": 24511725.0, + "reward": 1.8229167461395264, + "reward_std": 0.2651650607585907, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.543626637151817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.06243800977244973, + "learning_rate": 1.8257807336354353e-05, + "loss": 0.0025, + "num_tokens": 24522037.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 365.625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.5438111049621841, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.05290189303923398, + "learning_rate": 1.8255990949625356e-05, + "loss": 0.0021, + "num_tokens": 24532666.0, + "reward": 1.8928570747375488, + "reward_std": 0.30304577946662903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 399.75, + "completions/mean_terminated_length": 399.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.5439955727725512, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05672035925090313, + "learning_rate": 1.8254173706970125e-05, + "loss": 0.0023, + "num_tokens": 24540304.0, + "reward": 1.732758641242981, + "reward_std": 0.24014461040496826, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7327585816383362, + "rewards/fixed_code_pass_all_test_reward/std": 0.24014464020729065, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 405.125, + "completions/mean_terminated_length": 405.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5441800405829182, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.05459321022499353, + "learning_rate": 1.8252355608577054e-05, + "loss": 0.0022, + "num_tokens": 24550225.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 377.125, + "completions/mean_terminated_length": 377.125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.5443645083932853, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.205078125, + "kl": 0.05239320429973304, + "learning_rate": 1.825053665463463e-05, + "loss": 0.0021, + "num_tokens": 24559874.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 365.875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5445489762036525, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.04318390681874007, + "learning_rate": 1.8248716845331435e-05, + "loss": 0.0017, + "num_tokens": 24566945.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 289.375, + "completions/mean_terminated_length": 289.375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5447334440140196, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.05769297340884805, + "learning_rate": 1.8246896180856132e-05, + "loss": 0.0023, + "num_tokens": 24575884.0, + "reward": 1.0138888359069824, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.013888888992369175, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 516.25, + "completions/mean_terminated_length": 516.25, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.5449179118243866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.02848245552740991, + "learning_rate": 1.824507466139748e-05, + "loss": 0.0011, + "num_tokens": 24584422.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 464.5, + "completions/mean_terminated_length": 464.5, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.5451023796347537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.02924036094918847, + "learning_rate": 1.8243252287144312e-05, + "loss": 0.0012, + "num_tokens": 24595882.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.5452868474451208, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.06980294804088771, + "learning_rate": 1.8241429058285568e-05, + "loss": 0.0028, + "num_tokens": 24600796.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 316.5, + "completions/mean_terminated_length": 316.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.5454713152554879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.0644823091570288, + "learning_rate": 1.8239604975010266e-05, + "loss": 0.0026, + "num_tokens": 24611304.0, + "reward": 1.5035715103149414, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 296.125, + "completions/mean_terminated_length": 296.125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.545655783065855, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.07922717509791255, + "learning_rate": 1.8237780037507512e-05, + "loss": 0.0032, + "num_tokens": 24619297.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 146.5, + "completions/mean_terminated_length": 146.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.5458402508762221, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.052835587644949555, + "learning_rate": 1.823595424596651e-05, + "loss": 0.0021, + "num_tokens": 24623245.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 258.0, + "completions/mean_terminated_length": 258.0, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5460247186865892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10009765625, + "kl": 0.038339386926963925, + "learning_rate": 1.823412760057654e-05, + "loss": 0.0015, + "num_tokens": 24631877.0, + "reward": 1.059999942779541, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05999999865889549, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 374.5, + "completions/mean_terminated_length": 374.5, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5462091864969563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053955078125, + "kl": 0.026171072269789875, + "learning_rate": 1.8232300101526977e-05, + "loss": 0.001, + "num_tokens": 24639329.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 248.25, + "completions/mean_terminated_length": 248.25, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.5463936543073233, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.04062277381308377, + "learning_rate": 1.8230471749007286e-05, + "loss": 0.0016, + "num_tokens": 24647051.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 340.375, + "completions/mean_terminated_length": 340.375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.5465781221176904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.028938467847183347, + "learning_rate": 1.822864254320702e-05, + "loss": 0.0012, + "num_tokens": 24656390.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.5467625899280576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.044765309197828174, + "learning_rate": 1.8226812484315813e-05, + "loss": 0.0018, + "num_tokens": 24660535.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.5469470577384247, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.04779591062106192, + "learning_rate": 1.8224981572523397e-05, + "loss": 0.0019, + "num_tokens": 24665908.0, + "reward": 1.451612949371338, + "reward_std": 0.05973030999302864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4516128897666931, + "rewards/fixed_code_pass_all_test_reward/std": 0.059730324894189835, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 410.25, + "completions/mean_terminated_length": 410.25, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.5471315255487917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.019439691270235926, + "learning_rate": 1.8223149808019588e-05, + "loss": 0.0008, + "num_tokens": 24676726.0, + "reward": 1.9525315761566162, + "reward_std": 0.13426080346107483, + "rewards/fixed_code_pass_all_test_reward/mean": 0.952531635761261, + "rewards/fixed_code_pass_all_test_reward/std": 0.13426078855991364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 155.25, + "completions/mean_terminated_length": 155.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5473159933591588, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.34375, + "kl": 0.07478530332446098, + "learning_rate": 1.8221317190994296e-05, + "loss": 0.003, + "num_tokens": 24680704.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.0, + "completions/max_terminated_length": 1545.0, + "completions/mean_length": 1259.0, + "completions/mean_terminated_length": 1259.0, + "completions/min_length": 1041.0, + "completions/min_terminated_length": 1041.0, + "epoch": 0.5475004611695259, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.609375, + "kl": 0.03837016923353076, + "learning_rate": 1.8219483721637506e-05, + "loss": 0.0015, + "num_tokens": 24703392.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 302.625, + "completions/mean_terminated_length": 302.625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.547684928979893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1669921875, + "kl": 0.03157455159816891, + "learning_rate": 1.8217649400139307e-05, + "loss": 0.0013, + "num_tokens": 24709733.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 423.625, + "completions/mean_terminated_length": 423.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.5478693967902601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0464710455853492, + "learning_rate": 1.8215814226689867e-05, + "loss": 0.0019, + "num_tokens": 24720562.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 488.625, + "completions/mean_terminated_length": 488.625, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.5480538646006272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.03459405468311161, + "learning_rate": 1.821397820147944e-05, + "loss": 0.0014, + "num_tokens": 24729807.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1913.0, + "completions/max_terminated_length": 1913.0, + "completions/mean_length": 598.0, + "completions/mean_terminated_length": 598.0, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.5482383324109943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58984375, + "kl": 0.03416860767174512, + "learning_rate": 1.821214132469838e-05, + "loss": 0.0014, + "num_tokens": 24742167.0, + "reward": 1.6677632331848145, + "reward_std": 0.712117075920105, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7927631735801697, + "rewards/fixed_code_pass_all_test_reward/std": 0.3944704830646515, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 284.375, + "completions/mean_terminated_length": 284.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.5484228002213614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.026601176243275404, + "learning_rate": 1.8210303596537118e-05, + "loss": 0.0011, + "num_tokens": 24747570.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 346.25, + "completions/mean_terminated_length": 346.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.5486072680317284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.03270949190482497, + "learning_rate": 1.8208465017186178e-05, + "loss": 0.0013, + "num_tokens": 24758244.0, + "reward": 1.9234694242477417, + "reward_std": 0.21646122634410858, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9234694242477417, + "rewards/fixed_code_pass_all_test_reward/std": 0.21646127104759216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 353.875, + "completions/mean_terminated_length": 353.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5487917358420955, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.0776722792070359, + "learning_rate": 1.8206625586836174e-05, + "loss": 0.0031, + "num_tokens": 24765035.0, + "reward": 1.816176414489746, + "reward_std": 0.3686121702194214, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8161764740943909, + "rewards/fixed_code_pass_all_test_reward/std": 0.3686121702194214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5489762036524627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.040628436487168074, + "learning_rate": 1.8204785305677807e-05, + "loss": 0.0016, + "num_tokens": 24774255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 162.875, + "completions/mean_terminated_length": 162.875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.5491606714628298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.306640625, + "kl": 0.08926429552957416, + "learning_rate": 1.8202944173901856e-05, + "loss": 0.0036, + "num_tokens": 24778222.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 528.625, + "completions/mean_terminated_length": 528.625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.5493451392731968, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.03684735030401498, + "learning_rate": 1.8201102191699205e-05, + "loss": 0.0015, + "num_tokens": 24787491.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 691.0, + "completions/mean_terminated_length": 691.0, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "epoch": 0.5495296070835639, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.703125, + "kl": 0.020021209842525423, + "learning_rate": 1.819925935926082e-05, + "loss": 0.0008, + "num_tokens": 24799859.0, + "reward": 1.4791666269302368, + "reward_std": 0.43129098415374756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.43129101395606995, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 300.875, + "completions/mean_terminated_length": 300.875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.549714074893931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.0536803244613111, + "learning_rate": 1.8197415676777747e-05, + "loss": 0.0021, + "num_tokens": 24810170.0, + "reward": 1.6285715103149414, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6285714507102966, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.549898542704298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1923828125, + "kl": 0.06499074632301927, + "learning_rate": 1.8195571144441137e-05, + "loss": 0.0026, + "num_tokens": 24817402.0, + "reward": 1.1612902879714966, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16129031777381897, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 457.75, + "completions/mean_terminated_length": 457.75, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.5500830105146652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.026158926193602383, + "learning_rate": 1.8193725762442206e-05, + "loss": 0.001, + "num_tokens": 24829872.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 265.75, + "completions/mean_terminated_length": 265.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.5502674783250323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.07190307090058923, + "learning_rate": 1.819187953097228e-05, + "loss": 0.0029, + "num_tokens": 24836398.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 642.625, + "completions/mean_terminated_length": 642.625, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "epoch": 0.5504519461353994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.625, + "kl": 0.04322959540877491, + "learning_rate": 1.819003245022276e-05, + "loss": 0.0017, + "num_tokens": 24847747.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 293.0, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.5506364139457665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.03564634523354471, + "learning_rate": 1.818818452038514e-05, + "loss": 0.0014, + "num_tokens": 24853787.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 133.625, + "completions/mean_terminated_length": 133.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.5508208817561335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.259765625, + "kl": 0.06751763564534485, + "learning_rate": 1.8186335741651e-05, + "loss": 0.0027, + "num_tokens": 24857696.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 372.75, + "completions/mean_terminated_length": 372.75, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5510053495665006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.046009131940081716, + "learning_rate": 1.8184486114212012e-05, + "loss": 0.0018, + "num_tokens": 24867390.0, + "reward": 1.6307470798492432, + "reward_std": 0.25275692343711853, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6307471394538879, + "rewards/fixed_code_pass_all_test_reward/std": 0.25275692343711853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 935.75, + "completions/mean_terminated_length": 776.857177734375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.5511898173768678, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.267578125, + "kl": 0.022496830439195037, + "learning_rate": 1.8182635638259932e-05, + "loss": 0.0009, + "num_tokens": 24881452.0, + "reward": 1.716346025466919, + "reward_std": 0.3557506501674652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8413461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.013598216697573662, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 236.625, + "completions/mean_terminated_length": 236.625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.5513742851872349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.056536976946517825, + "learning_rate": 1.8180784313986603e-05, + "loss": 0.0023, + "num_tokens": 24890785.0, + "reward": 1.3134920597076416, + "reward_std": 0.45380091667175293, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3134920597076416, + "rewards/fixed_code_pass_all_test_reward/std": 0.45380091667175293, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 102.5, + "completions/mean_terminated_length": 102.5, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.5515587529976019, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.07373393326997757, + "learning_rate": 1.817893214158396e-05, + "loss": 0.0029, + "num_tokens": 24894333.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 248.125, + "completions/mean_terminated_length": 248.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.551743220807969, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1787109375, + "kl": 0.052373128943145275, + "learning_rate": 1.8177079121244023e-05, + "loss": 0.0021, + "num_tokens": 24899230.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 221.75, + "completions/mean_terminated_length": 221.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.5519276886183361, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.03477802500128746, + "learning_rate": 1.8175225253158903e-05, + "loss": 0.0014, + "num_tokens": 24905516.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 483.75, + "completions/mean_terminated_length": 483.75, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.5521121564287031, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.018508803623262793, + "learning_rate": 1.8173370537520792e-05, + "loss": 0.0007, + "num_tokens": 24914802.0, + "reward": 1.8020833730697632, + "reward_std": 0.3271373212337494, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3271373510360718, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 490.25, + "completions/mean_terminated_length": 490.25, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.5522966242390703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.025187405291944742, + "learning_rate": 1.8171514974521982e-05, + "loss": 0.001, + "num_tokens": 24923196.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 352.625, + "completions/mean_terminated_length": 352.625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5524810920494374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.03938381338957697, + "learning_rate": 1.816965856435484e-05, + "loss": 0.0016, + "num_tokens": 24931857.0, + "reward": 1.9936224222183228, + "reward_std": 0.018038442358374596, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9936224222183228, + "rewards/fixed_code_pass_all_test_reward/std": 0.01803842931985855, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 279.5, + "completions/mean_terminated_length": 279.5, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5526655598598045, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.04877182422205806, + "learning_rate": 1.816780130721182e-05, + "loss": 0.002, + "num_tokens": 24937877.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.5528500276701716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.0493067423813045, + "learning_rate": 1.8165943203285484e-05, + "loss": 0.002, + "num_tokens": 24943633.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 353.25, + "completions/mean_terminated_length": 353.25, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5530344954805386, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.0677804525475949, + "learning_rate": 1.8164084252768463e-05, + "loss": 0.0027, + "num_tokens": 24950539.0, + "reward": 1.779411792755127, + "reward_std": 0.054460056126117706, + "rewards/fixed_code_pass_all_test_reward/mean": 0.779411792755127, + "rewards/fixed_code_pass_all_test_reward/std": 0.05446000397205353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 458.0, + "completions/mean_terminated_length": 458.0, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.5532189632909057, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.05057398322969675, + "learning_rate": 1.8162224455853474e-05, + "loss": 0.002, + "num_tokens": 24963915.0, + "reward": 1.875, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 530.375, + "completions/mean_terminated_length": 530.375, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.5534034311012729, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.039687435259111226, + "learning_rate": 1.8160363812733336e-05, + "loss": 0.0016, + "num_tokens": 24973558.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 303.875, + "completions/mean_terminated_length": 303.875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.55358789891164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.04045181313995272, + "learning_rate": 1.8158502323600943e-05, + "loss": 0.0016, + "num_tokens": 24981029.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 325.625, + "completions/mean_terminated_length": 325.625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.553772366722007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0277099609375, + "kl": 0.01385993673466146, + "learning_rate": 1.8156639988649285e-05, + "loss": 0.0006, + "num_tokens": 24991154.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.5539568345323741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.050731312832795084, + "learning_rate": 1.8154776808071436e-05, + "loss": 0.002, + "num_tokens": 25000017.0, + "reward": 1.0416666269302368, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0416666679084301, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.5541413023427412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1630859375, + "kl": 0.054657270666211843, + "learning_rate": 1.8152912782060556e-05, + "loss": 0.0022, + "num_tokens": 25006931.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 354.75, + "completions/mean_terminated_length": 354.75, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.5543257701531082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.04281378327868879, + "learning_rate": 1.8151047910809898e-05, + "loss": 0.0017, + "num_tokens": 25017217.0, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 498.875, + "completions/mean_terminated_length": 498.875, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.5545102379634754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032958984375, + "kl": 0.023587030009366572, + "learning_rate": 1.81491821945128e-05, + "loss": 0.0009, + "num_tokens": 25030024.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 215.625, + "completions/mean_terminated_length": 215.625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.5546947057738425, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.34375, + "kl": 0.06078086397610605, + "learning_rate": 1.8147315633362682e-05, + "loss": 0.0024, + "num_tokens": 25034901.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 276.125, + "completions/mean_terminated_length": 276.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5548791735842096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.04330155742354691, + "learning_rate": 1.814544822755306e-05, + "loss": 0.0017, + "num_tokens": 25041126.0, + "reward": 0.9471153616905212, + "reward_std": 0.42079177498817444, + "rewards/fixed_code_pass_all_test_reward/mean": 0.19711539149284363, + "rewards/fixed_code_pass_all_test_reward/std": 0.19223898649215698, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 122.875, + "completions/mean_terminated_length": 122.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.5550636413945766, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.953125, + "kl": 0.07741780462674797, + "learning_rate": 1.8143579977277534e-05, + "loss": 0.0031, + "num_tokens": 25044805.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 401.0, + "completions/mean_terminated_length": 401.0, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.5552481092049437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06641779514029622, + "learning_rate": 1.8141710882729792e-05, + "loss": 0.0027, + "num_tokens": 25056597.0, + "reward": 1.8977272510528564, + "reward_std": 0.28927093744277954, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8977272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.28927096724510193, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 294.625, + "completions/mean_terminated_length": 294.625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5554325770153108, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.030493160127662122, + "learning_rate": 1.813984094410361e-05, + "loss": 0.0012, + "num_tokens": 25063162.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 285.875, + "completions/mean_terminated_length": 285.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.5556170448256779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.036026187939569354, + "learning_rate": 1.813797016159285e-05, + "loss": 0.0014, + "num_tokens": 25071313.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 517.0, + "completions/mean_terminated_length": 517.0, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.555801512636045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.018412979901768267, + "learning_rate": 1.813609853539146e-05, + "loss": 0.0007, + "num_tokens": 25079977.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 207.5, + "completions/mean_terminated_length": 207.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5559859804464121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.04467348847538233, + "learning_rate": 1.813422606569348e-05, + "loss": 0.0018, + "num_tokens": 25084445.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 363.25, + "completions/mean_terminated_length": 363.25, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.5561704482567792, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.03971067944075912, + "learning_rate": 1.8132352752693038e-05, + "loss": 0.0016, + "num_tokens": 25091967.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 239.125, + "completions/mean_terminated_length": 239.125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.5563549160671463, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.0852273222990334, + "learning_rate": 1.813047859658434e-05, + "loss": 0.0034, + "num_tokens": 25101112.0, + "reward": 1.5059523582458496, + "reward_std": 0.5282528400421143, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5059523582458496, + "rewards/fixed_code_pass_all_test_reward/std": 0.5282528400421143, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 358.0, + "completions/mean_terminated_length": 358.0, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5565393838775133, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.05209075682796538, + "learning_rate": 1.8128603597561693e-05, + "loss": 0.0021, + "num_tokens": 25108192.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5567238516878804, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.028394917375408113, + "learning_rate": 1.8126727755819477e-05, + "loss": 0.0011, + "num_tokens": 25112659.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 352.25, + "completions/mean_terminated_length": 352.25, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.5569083194982476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.028425589785911143, + "learning_rate": 1.8124851071552176e-05, + "loss": 0.0011, + "num_tokens": 25121429.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 366.125, + "completions/mean_terminated_length": 366.125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5570927873086147, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.03395583666861057, + "learning_rate": 1.8122973544954346e-05, + "loss": 0.0014, + "num_tokens": 25132254.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 395.375, + "completions/mean_terminated_length": 395.375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.5572772551189817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.03574182232841849, + "learning_rate": 1.8121095176220635e-05, + "loss": 0.0014, + "num_tokens": 25141977.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5574617229293488, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.04595293151214719, + "learning_rate": 1.811921596554578e-05, + "loss": 0.0018, + "num_tokens": 25148627.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.5576461907397159, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.018557009403593838, + "learning_rate": 1.8117335913124613e-05, + "loss": 0.0007, + "num_tokens": 25153198.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 631.25, + "completions/mean_terminated_length": 631.25, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.557830658550083, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.028582318976987153, + "learning_rate": 1.8115455019152038e-05, + "loss": 0.0011, + "num_tokens": 25168432.0, + "reward": 1.1333333253860474, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.5580151263604501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.03259554575197399, + "learning_rate": 1.8113573283823056e-05, + "loss": 0.0013, + "num_tokens": 25179767.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 406.75, + "completions/mean_terminated_length": 406.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5581995941708172, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.05117500759661198, + "learning_rate": 1.811169070733275e-05, + "loss": 0.002, + "num_tokens": 25190781.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 253.0, + "completions/mean_terminated_length": 253.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5583840619811843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.048692459939047694, + "learning_rate": 1.8109807289876294e-05, + "loss": 0.0019, + "num_tokens": 25197029.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 301.75, + "completions/mean_terminated_length": 301.75, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.5585685297915514, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.04838477657176554, + "learning_rate": 1.8107923031648952e-05, + "loss": 0.0019, + "num_tokens": 25207691.0, + "reward": 1.75, + "reward_std": 0.3162277340888977, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.3162277638912201, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.5587529976019184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.119140625, + "kl": 0.07281749369576573, + "learning_rate": 1.810603793284607e-05, + "loss": 0.0029, + "num_tokens": 25213794.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 400.125, + "completions/mean_terminated_length": 400.125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5589374654122855, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.041095297085121274, + "learning_rate": 1.810415199366308e-05, + "loss": 0.0016, + "num_tokens": 25221651.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5591219332226527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04006575420498848, + "learning_rate": 1.8102265214295506e-05, + "loss": 0.0016, + "num_tokens": 25230580.0, + "reward": 1.9083333015441895, + "reward_std": 0.2592725157737732, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9083333015441895, + "rewards/fixed_code_pass_all_test_reward/std": 0.2592725157737732, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 302.375, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.5593064010330198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.02845871902536601, + "learning_rate": 1.8100377594938948e-05, + "loss": 0.0011, + "num_tokens": 25237023.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 374.625, + "completions/mean_terminated_length": 374.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.5594908688433868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.04617732926271856, + "learning_rate": 1.809848913578912e-05, + "loss": 0.0018, + "num_tokens": 25247388.0, + "reward": 1.7395832538604736, + "reward_std": 0.4351552426815033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.32101497054100037, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 318.75, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5596753366537539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.032301858940627426, + "learning_rate": 1.8096599837041786e-05, + "loss": 0.0013, + "num_tokens": 25254378.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.559859804464121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.04430544306524098, + "learning_rate": 1.809470969889283e-05, + "loss": 0.0018, + "num_tokens": 25260555.0, + "reward": 1.7333333492279053, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7333333492279053, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.560044272274488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.02662464464083314, + "learning_rate": 1.80928187215382e-05, + "loss": 0.0011, + "num_tokens": 25265205.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 252.75, + "completions/mean_terminated_length": 252.75, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.5602287400848552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.03058117430191487, + "learning_rate": 1.8090926905173944e-05, + "loss": 0.0012, + "num_tokens": 25273851.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 348.625, + "completions/mean_terminated_length": 348.625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.5604132078952223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.02531021786853671, + "learning_rate": 1.8089034249996193e-05, + "loss": 0.001, + "num_tokens": 25280400.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 348.875, + "completions/mean_terminated_length": 348.875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5605976757055894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.06138602364808321, + "learning_rate": 1.8087140756201164e-05, + "loss": 0.0025, + "num_tokens": 25291095.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 155.125, + "completions/mean_terminated_length": 155.125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.5607821435159565, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.0846156133338809, + "learning_rate": 1.8085246423985157e-05, + "loss": 0.0034, + "num_tokens": 25295016.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 312.375, + "completions/mean_terminated_length": 312.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5609666113263235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051513671875, + "kl": 0.036287104012444615, + "learning_rate": 1.808335125354457e-05, + "loss": 0.0015, + "num_tokens": 25306211.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.5611510791366906, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.03876070445403457, + "learning_rate": 1.8081455245075885e-05, + "loss": 0.0016, + "num_tokens": 25313547.0, + "reward": 1.076923131942749, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07692307978868484, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 497.625, + "completions/mean_terminated_length": 497.625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.5613355469470578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.03324142191559076, + "learning_rate": 1.807955839877566e-05, + "loss": 0.0013, + "num_tokens": 25327168.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 624.25, + "completions/mean_terminated_length": 624.25, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "epoch": 0.5615200147574249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.03152924100868404, + "learning_rate": 1.8077660714840552e-05, + "loss": 0.0013, + "num_tokens": 25338234.0, + "reward": 1.137930989265442, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13793103396892548, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 297.25, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.5617044825677919, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.061444617342203856, + "learning_rate": 1.8075762193467296e-05, + "loss": 0.0025, + "num_tokens": 25346980.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 524.375, + "completions/mean_terminated_length": 524.375, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.561888950378159, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.036382993683218956, + "learning_rate": 1.807386283485272e-05, + "loss": 0.0015, + "num_tokens": 25357015.0, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 424.5, + "completions/mean_terminated_length": 424.5, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5620734181885261, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.03431850136257708, + "learning_rate": 1.807196263919374e-05, + "loss": 0.0014, + "num_tokens": 25364779.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 261.125, + "completions/mean_terminated_length": 261.125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.5622578859988931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.02997777797281742, + "learning_rate": 1.8070061606687354e-05, + "loss": 0.0012, + "num_tokens": 25370564.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 152.0, + "completions/mean_terminated_length": 152.0, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.5624423538092603, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.265625, + "kl": 0.06041325395926833, + "learning_rate": 1.8068159737530644e-05, + "loss": 0.0024, + "num_tokens": 25374692.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 332.375, + "completions/mean_terminated_length": 332.375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.5626268216196274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.0637503219768405, + "learning_rate": 1.8066257031920788e-05, + "loss": 0.0026, + "num_tokens": 25382791.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 255.25, + "completions/mean_terminated_length": 255.25, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.5628112894299945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.05269561498425901, + "learning_rate": 1.8064353490055046e-05, + "loss": 0.0021, + "num_tokens": 25390393.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 141.625, + "completions/mean_terminated_length": 141.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.5629957572403615, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10595703125, + "kl": 0.056827078107744455, + "learning_rate": 1.806244911213076e-05, + "loss": 0.0023, + "num_tokens": 25394246.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 662.0, + "completions/mean_terminated_length": 662.0, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.5631802250507286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.047894252464175224, + "learning_rate": 1.8060543898345368e-05, + "loss": 0.0019, + "num_tokens": 25410182.0, + "reward": 1.1411290168762207, + "reward_std": 0.6388952732086182, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2661290168762207, + "rewards/fixed_code_pass_all_test_reward/std": 0.4551376700401306, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 242.5, + "completions/mean_terminated_length": 242.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.5633646928610957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.04374005342833698, + "learning_rate": 1.8058637848896387e-05, + "loss": 0.0017, + "num_tokens": 25418490.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 366.375, + "completions/mean_terminated_length": 366.375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5635491606714629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.038498512003570795, + "learning_rate": 1.8056730963981426e-05, + "loss": 0.0015, + "num_tokens": 25427549.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 272.0, + "completions/mean_terminated_length": 272.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.56373362848183, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.04802217590622604, + "learning_rate": 1.8054823243798178e-05, + "loss": 0.0019, + "num_tokens": 25432445.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 319.75, + "completions/mean_terminated_length": 319.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.563918096292197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69140625, + "kl": 0.03593837353400886, + "learning_rate": 1.8052914688544416e-05, + "loss": 0.0014, + "num_tokens": 25439139.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5641025641025641, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.023547242977656424, + "learning_rate": 1.8051005298418016e-05, + "loss": 0.0009, + "num_tokens": 25443968.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.5642870319129312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.028751458739861846, + "learning_rate": 1.8049095073616927e-05, + "loss": 0.0012, + "num_tokens": 25450577.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 683.375, + "completions/mean_terminated_length": 683.375, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "epoch": 0.5644714997232982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.02337012381758541, + "learning_rate": 1.804718401433919e-05, + "loss": 0.0009, + "num_tokens": 25471260.0, + "reward": 1.0322580337524414, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.032258063554763794, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.5646559675336654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.05154152354225516, + "learning_rate": 1.8045272120782926e-05, + "loss": 0.0021, + "num_tokens": 25479527.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5648404353440325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.07118550734594464, + "learning_rate": 1.804335939314635e-05, + "loss": 0.0028, + "num_tokens": 25487228.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 385.375, + "completions/mean_terminated_length": 385.375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.5650249031543996, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.051041068974882364, + "learning_rate": 1.8041445831627765e-05, + "loss": 0.002, + "num_tokens": 25496831.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 260.25, + "completions/mean_terminated_length": 260.25, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.5652093709647666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.05082690296694636, + "learning_rate": 1.8039531436425548e-05, + "loss": 0.002, + "num_tokens": 25504009.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 500.125, + "completions/mean_terminated_length": 500.125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.5653938387751337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.033118094550445676, + "learning_rate": 1.803761620773818e-05, + "loss": 0.0013, + "num_tokens": 25512810.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 512.375, + "completions/mean_terminated_length": 512.375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.5655783065855008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.027211367269046605, + "learning_rate": 1.8035700145764213e-05, + "loss": 0.0011, + "num_tokens": 25522877.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 336.5, + "completions/mean_terminated_length": 336.5, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.565762774395868, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.053582322085276246, + "learning_rate": 1.803378325070229e-05, + "loss": 0.0021, + "num_tokens": 25532625.0, + "reward": 1.578125, + "reward_std": 0.46740877628326416, + "rewards/fixed_code_pass_all_test_reward/mean": 0.578125, + "rewards/fixed_code_pass_all_test_reward/std": 0.46740880608558655, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 376.25, + "completions/mean_terminated_length": 376.25, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.565947242206235, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.03318937751464546, + "learning_rate": 1.8031865522751147e-05, + "loss": 0.0013, + "num_tokens": 25542843.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.5661317100166021, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3046875, + "kl": 0.052035853266716, + "learning_rate": 1.80299469621096e-05, + "loss": 0.0021, + "num_tokens": 25551843.0, + "reward": 1.8333332538604736, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8333333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 522.0, + "completions/mean_terminated_length": 522.0, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.5663161778269692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.03220933233387768, + "learning_rate": 1.8028027568976555e-05, + "loss": 0.0013, + "num_tokens": 25565651.0, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 349.25, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5665006456373363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.040862676221877337, + "learning_rate": 1.8026107343550996e-05, + "loss": 0.0016, + "num_tokens": 25575301.0, + "reward": 1.850806474685669, + "reward_std": 0.35038575530052185, + "rewards/fixed_code_pass_all_test_reward/mean": 0.850806474685669, + "rewards/fixed_code_pass_all_test_reward/std": 0.35038575530052185, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 517.25, + "completions/mean_terminated_length": 517.25, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.5666851134477033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.05594922695308924, + "learning_rate": 1.8024186286032002e-05, + "loss": 0.0022, + "num_tokens": 25584455.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 460.75, + "completions/mean_terminated_length": 460.75, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5668695812580705, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.03176037850789726, + "learning_rate": 1.8022264396618732e-05, + "loss": 0.0013, + "num_tokens": 25592805.0, + "reward": 1.894230842590332, + "reward_std": 0.1958465278148651, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8942307829856873, + "rewards/fixed_code_pass_all_test_reward/std": 0.1958465576171875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5670540490684376, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.057953457813709974, + "learning_rate": 1.8020341675510446e-05, + "loss": 0.0023, + "num_tokens": 25604821.0, + "reward": 1.567307710647583, + "reward_std": 0.2670634984970093, + "rewards/fixed_code_pass_all_test_reward/mean": 0.567307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.2670634984970093, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 217.375, + "completions/mean_terminated_length": 217.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5672385168788047, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.06375005072914064, + "learning_rate": 1.8018418122906465e-05, + "loss": 0.0026, + "num_tokens": 25610032.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.5674229846891717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.06240540253929794, + "learning_rate": 1.801649373900622e-05, + "loss": 0.0025, + "num_tokens": 25619114.0, + "reward": 1.315000057220459, + "reward_std": 0.5802216529846191, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4399999976158142, + "rewards/fixed_code_pass_all_test_reward/std": 0.2931601107120514, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 525.25, + "completions/mean_terminated_length": 525.25, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.5676074524995388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.029175078379921615, + "learning_rate": 1.8014568524009216e-05, + "loss": 0.0012, + "num_tokens": 25628420.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 421.625, + "completions/mean_terminated_length": 421.625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.5677919203099059, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.041583041893318295, + "learning_rate": 1.801264247811504e-05, + "loss": 0.0017, + "num_tokens": 25640017.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 488.875, + "completions/mean_terminated_length": 488.875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.567976388120273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.042831100639887154, + "learning_rate": 1.8010715601523383e-05, + "loss": 0.0017, + "num_tokens": 25650792.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 300.5, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.5681608559306401, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04248046875, + "kl": 0.035763921681791544, + "learning_rate": 1.8008787894434003e-05, + "loss": 0.0014, + "num_tokens": 25659972.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 304.375, + "completions/mean_terminated_length": 304.375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.5683453237410072, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.04125258978456259, + "learning_rate": 1.800685935704675e-05, + "loss": 0.0017, + "num_tokens": 25669231.0, + "reward": 1.90625, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 652.625, + "completions/mean_terminated_length": 652.625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.5685297915513743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.04210096853785217, + "learning_rate": 1.8004929989561566e-05, + "loss": 0.0017, + "num_tokens": 25681076.0, + "reward": 1.254166603088379, + "reward_std": 0.4604819715023041, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2541666626930237, + "rewards/fixed_code_pass_all_test_reward/std": 0.46048200130462646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 261.75, + "completions/mean_terminated_length": 261.75, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5687142593617414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.0304510846035555, + "learning_rate": 1.8002999792178478e-05, + "loss": 0.0012, + "num_tokens": 25689730.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 185.125, + "completions/mean_terminated_length": 185.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.5688987271721084, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.05038401670753956, + "learning_rate": 1.8001068765097585e-05, + "loss": 0.002, + "num_tokens": 25694115.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 235.25, + "completions/mean_terminated_length": 235.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5690831949824755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.06559236533939838, + "learning_rate": 1.79991369085191e-05, + "loss": 0.0026, + "num_tokens": 25699253.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.5692676627928427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0597276974003762, + "learning_rate": 1.799720422264329e-05, + "loss": 0.0024, + "num_tokens": 25704296.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 278.75, + "completions/mean_terminated_length": 278.75, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.5694521306032098, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.04970954800955951, + "learning_rate": 1.799527070767053e-05, + "loss": 0.002, + "num_tokens": 25714470.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 424.75, + "completions/mean_terminated_length": 424.75, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.5696365984135768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.03950870712287724, + "learning_rate": 1.7993336363801272e-05, + "loss": 0.0016, + "num_tokens": 25722724.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 320.625, + "completions/mean_terminated_length": 320.625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.5698210662239439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.04021455207839608, + "learning_rate": 1.7991401191236053e-05, + "loss": 0.0016, + "num_tokens": 25732521.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 301.75, + "completions/mean_terminated_length": 301.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.570005534034311, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.039639255846850574, + "learning_rate": 1.7989465190175507e-05, + "loss": 0.0016, + "num_tokens": 25742855.0, + "reward": 1.081730842590332, + "reward_std": 0.4832080900669098, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20673078298568726, + "rewards/fixed_code_pass_all_test_reward/std": 0.3230622112751007, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.570190001844678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.03278004564344883, + "learning_rate": 1.798752836082034e-05, + "loss": 0.0013, + "num_tokens": 25752421.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 213.625, + "completions/mean_terminated_length": 213.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.5703744696550452, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.044994804076850414, + "learning_rate": 1.798559070337135e-05, + "loss": 0.0018, + "num_tokens": 25757314.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 665.0, + "completions/mean_terminated_length": 665.0, + "completions/min_length": 566.0, + "completions/min_terminated_length": 566.0, + "epoch": 0.5705589374654123, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5546875, + "kl": 0.021130827895831317, + "learning_rate": 1.798365221802942e-05, + "loss": 0.0008, + "num_tokens": 25769530.0, + "reward": 1.3541667461395264, + "reward_std": 0.058925557881593704, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3541666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.058925557881593704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 217.875, + "completions/mean_terminated_length": 217.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.5707434052757794, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.06319100176915526, + "learning_rate": 1.798171290499552e-05, + "loss": 0.0025, + "num_tokens": 25776569.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 275.625, + "completions/mean_terminated_length": 275.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5709278730861465, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.042994259390980005, + "learning_rate": 1.7979772764470708e-05, + "loss": 0.0017, + "num_tokens": 25785342.0, + "reward": 1.9659091234207153, + "reward_std": 0.09642363339662552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.5711123408965135, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.03610195638611913, + "learning_rate": 1.797783179665612e-05, + "loss": 0.0014, + "num_tokens": 25791579.0, + "reward": 1.1750000715255737, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 330.0, + "completions/mean_terminated_length": 330.0, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5712968087068806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.07988016516901553, + "learning_rate": 1.7975890001752987e-05, + "loss": 0.0032, + "num_tokens": 25800891.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 269.75, + "completions/mean_terminated_length": 269.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.5714812765172478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.0460772723890841, + "learning_rate": 1.7973947379962618e-05, + "loss": 0.0018, + "num_tokens": 25811073.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.5716657443276149, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.030236029415391386, + "learning_rate": 1.797200393148641e-05, + "loss": 0.0012, + "num_tokens": 25816198.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5718502121379819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.05689949379302561, + "learning_rate": 1.7970059656525853e-05, + "loss": 0.0023, + "num_tokens": 25821931.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.572034679948349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0509294094517827, + "learning_rate": 1.796811455528251e-05, + "loss": 0.002, + "num_tokens": 25830741.0, + "reward": 1.2870371341705322, + "reward_std": 0.06338176131248474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.28703704476356506, + "rewards/fixed_code_pass_all_test_reward/std": 0.06338173896074295, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 478.25, + "completions/mean_terminated_length": 478.25, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.5722191477587161, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.032214547507464886, + "learning_rate": 1.796616862795804e-05, + "loss": 0.0013, + "num_tokens": 25843247.0, + "reward": 1.2727272510528564, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 482.125, + "completions/mean_terminated_length": 482.125, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.5724036155690831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.030857123900204897, + "learning_rate": 1.796422187475418e-05, + "loss": 0.0012, + "num_tokens": 25852896.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 374.25, + "completions/mean_terminated_length": 374.25, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.5725880833794503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.06735233240760863, + "learning_rate": 1.7962274295872764e-05, + "loss": 0.0027, + "num_tokens": 25860578.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 393.5, + "completions/mean_terminated_length": 393.5, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5727725511898174, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.03646807977929711, + "learning_rate": 1.7960325891515695e-05, + "loss": 0.0015, + "num_tokens": 25871566.0, + "reward": 1.6477272510528564, + "reward_std": 0.32979726791381836, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.32979726791381836, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 166.125, + "completions/mean_terminated_length": 166.125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.5729570190001845, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8671875, + "kl": 0.08107712818309665, + "learning_rate": 1.7958376661884974e-05, + "loss": 0.0032, + "num_tokens": 25875751.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.5731414868105515, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.038643559673801064, + "learning_rate": 1.7956426607182687e-05, + "loss": 0.0015, + "num_tokens": 25885831.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 622.375, + "completions/mean_terminated_length": 622.375, + "completions/min_length": 508.0, + "completions/min_terminated_length": 508.0, + "epoch": 0.5733259546209186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73046875, + "kl": 0.03831643424928188, + "learning_rate": 1.7954475727611002e-05, + "loss": 0.0015, + "num_tokens": 25899962.0, + "reward": 1.9027777910232544, + "reward_std": 0.1384914666414261, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9027777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.13849148154258728, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 324.5, + "completions/mean_terminated_length": 324.5, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.5735104224312857, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.056681342888623476, + "learning_rate": 1.795252402337217e-05, + "loss": 0.0023, + "num_tokens": 25910014.0, + "reward": 1.4431817531585693, + "reward_std": 0.5464006662368774, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5681818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.19284729659557343, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 244.5, + "completions/mean_terminated_length": 244.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5736948902416529, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.050711131654679775, + "learning_rate": 1.7950571494668533e-05, + "loss": 0.002, + "num_tokens": 25918290.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 256.875, + "completions/mean_terminated_length": 256.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.57387935805202, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.08169829286634922, + "learning_rate": 1.7948618141702516e-05, + "loss": 0.0033, + "num_tokens": 25927689.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 208.75, + "completions/mean_terminated_length": 208.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.574063825862387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.04632511222735047, + "learning_rate": 1.7946663964676626e-05, + "loss": 0.0019, + "num_tokens": 25936655.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 222.25, + "completions/mean_terminated_length": 222.25, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5742482936727541, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.028407818637788296, + "learning_rate": 1.7944708963793464e-05, + "loss": 0.0011, + "num_tokens": 25942313.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 225.625, + "completions/mean_terminated_length": 225.625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.5744327614831212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.06385754852090031, + "learning_rate": 1.794275313925571e-05, + "loss": 0.0026, + "num_tokens": 25947974.0, + "reward": 0.9798386693000793, + "reward_std": 0.4050878584384918, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10483870655298233, + "rewards/fixed_code_pass_all_test_reward/std": 0.09561517834663391, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5746172292934882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057373046875, + "kl": 0.033226786530576646, + "learning_rate": 1.794079649126613e-05, + "loss": 0.0013, + "num_tokens": 25954792.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 408.625, + "completions/mean_terminated_length": 408.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5748016971038554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.05039692344143987, + "learning_rate": 1.793883902002758e-05, + "loss": 0.002, + "num_tokens": 25965725.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 210.75, + "completions/mean_terminated_length": 210.75, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.5749861649142225, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.06900247558951378, + "learning_rate": 1.7936880725742992e-05, + "loss": 0.0028, + "num_tokens": 25974115.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 252.0, + "completions/mean_terminated_length": 252.0, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.5751706327245896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.05080791120417416, + "learning_rate": 1.7934921608615393e-05, + "loss": 0.002, + "num_tokens": 25982715.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 359.75, + "completions/mean_terminated_length": 359.75, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.5753551005349566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.044273634208366275, + "learning_rate": 1.793296166884789e-05, + "loss": 0.0018, + "num_tokens": 25990217.0, + "reward": 1.9034091234207153, + "reward_std": 0.2732003331184387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9034091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.2732003331184387, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 256.125, + "completions/mean_terminated_length": 256.125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.5755395683453237, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.030609617359004915, + "learning_rate": 1.7931000906643675e-05, + "loss": 0.0012, + "num_tokens": 25995810.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 263.75, + "completions/mean_terminated_length": 263.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5757240361556908, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0639483192935586, + "learning_rate": 1.7929039322206028e-05, + "loss": 0.0026, + "num_tokens": 26005224.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 241.375, + "completions/mean_terminated_length": 241.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.575908503966058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.05460383091121912, + "learning_rate": 1.792707691573831e-05, + "loss": 0.0022, + "num_tokens": 26010859.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.576092971776425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.04581896890886128, + "learning_rate": 1.792511368744398e-05, + "loss": 0.0018, + "num_tokens": 26016970.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5762774395867921, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.05795079516246915, + "learning_rate": 1.7923149637526563e-05, + "loss": 0.0023, + "num_tokens": 26025456.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 205.0, + "completions/mean_terminated_length": 205.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5764619073971592, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.04561865923460573, + "learning_rate": 1.792118476618968e-05, + "loss": 0.0018, + "num_tokens": 26030280.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 464.0, + "completions/mean_terminated_length": 464.0, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.5766463752075263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.03982660220935941, + "learning_rate": 1.7919219073637038e-05, + "loss": 0.0016, + "num_tokens": 26039088.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 431.375, + "completions/mean_terminated_length": 431.375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.5768308430178933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.396484375, + "kl": 0.03529946773778647, + "learning_rate": 1.7917252560072426e-05, + "loss": 0.0014, + "num_tokens": 26047403.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.5770153108282605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.05018107523210347, + "learning_rate": 1.7915285225699718e-05, + "loss": 0.002, + "num_tokens": 26055881.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 390.875, + "completions/mean_terminated_length": 390.875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.5771997786386276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.038902438594959676, + "learning_rate": 1.791331707072288e-05, + "loss": 0.0016, + "num_tokens": 26067712.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.5773842464489947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.044972452567890286, + "learning_rate": 1.791134809534595e-05, + "loss": 0.0018, + "num_tokens": 26072675.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 354.0, + "completions/mean_terminated_length": 354.0, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5775687142593617, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.05014026677235961, + "learning_rate": 1.790937829977306e-05, + "loss": 0.002, + "num_tokens": 26079347.0, + "reward": 1.828125, + "reward_std": 0.2829807996749878, + "rewards/fixed_code_pass_all_test_reward/mean": 0.828125, + "rewards/fixed_code_pass_all_test_reward/std": 0.2829807996749878, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 262.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5777531820697288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1240234375, + "kl": 0.05651051318272948, + "learning_rate": 1.7907407684208424e-05, + "loss": 0.0023, + "num_tokens": 26088296.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 683.0, + "completions/mean_terminated_length": 683.0, + "completions/min_length": 657.0, + "completions/min_terminated_length": 657.0, + "epoch": 0.5779376498800959, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.028272128431126475, + "learning_rate": 1.790543624885635e-05, + "loss": 0.0011, + "num_tokens": 26104872.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 467.75, + "completions/mean_terminated_length": 467.75, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.5781221176904631, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.02935690840240568, + "learning_rate": 1.7903463993921214e-05, + "loss": 0.0012, + "num_tokens": 26113542.0, + "reward": 1.711111068725586, + "reward_std": 0.45184317231178284, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7111111283302307, + "rewards/fixed_code_pass_all_test_reward/std": 0.4518432021141052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.5783065855008301, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.041385607328265905, + "learning_rate": 1.790149091960749e-05, + "loss": 0.0017, + "num_tokens": 26122451.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 511.125, + "completions/mean_terminated_length": 511.125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.5784910533111972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.03805990342516452, + "learning_rate": 1.7899517026119735e-05, + "loss": 0.0015, + "num_tokens": 26135020.0, + "reward": 1.638888955116272, + "reward_std": 0.029695741832256317, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6388888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.02969570830464363, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 362.5, + "completions/mean_terminated_length": 362.5, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.5786755211215643, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04869112162850797, + "learning_rate": 1.7897542313662586e-05, + "loss": 0.0019, + "num_tokens": 26142688.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5788599889319314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.025036659091711044, + "learning_rate": 1.7895566782440768e-05, + "loss": 0.001, + "num_tokens": 26150918.0, + "reward": 1.040816307067871, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.040816325694322586, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 359.25, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5790444567422984, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.05975192226469517, + "learning_rate": 1.7893590432659093e-05, + "loss": 0.0024, + "num_tokens": 26159736.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 470.5, + "completions/mean_terminated_length": 470.5, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.5792289245526656, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.03821381670422852, + "learning_rate": 1.789161326452246e-05, + "loss": 0.0015, + "num_tokens": 26171164.0, + "reward": 1.4166666269302368, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 637.625, + "completions/mean_terminated_length": 637.625, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "epoch": 0.5794133923630327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.035475694574415684, + "learning_rate": 1.788963527823584e-05, + "loss": 0.0014, + "num_tokens": 26182193.0, + "reward": 1.7629311084747314, + "reward_std": 0.32735925912857056, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7629310488700867, + "rewards/fixed_code_pass_all_test_reward/std": 0.32735928893089294, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 146.0, + "completions/mean_terminated_length": 146.0, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.5795978601733998, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1904296875, + "kl": 0.0821554206777364, + "learning_rate": 1.78876564740043e-05, + "loss": 0.0033, + "num_tokens": 26186321.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 195.875, + "completions/mean_terminated_length": 195.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.5797823279837668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.0441164537332952, + "learning_rate": 1.788567685203299e-05, + "loss": 0.0018, + "num_tokens": 26190688.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 345.375, + "completions/mean_terminated_length": 345.375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.5799667957941339, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.05269231228157878, + "learning_rate": 1.7883696412527148e-05, + "loss": 0.0021, + "num_tokens": 26214275.0, + "reward": 1.9962348937988281, + "reward_std": 0.010649183765053749, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9962349534034729, + "rewards/fixed_code_pass_all_test_reward/std": 0.010649202391505241, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 460.0, + "completions/mean_terminated_length": 460.0, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.580151263604501, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.026075148256495595, + "learning_rate": 1.788171515569209e-05, + "loss": 0.001, + "num_tokens": 26222563.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 197.125, + "completions/mean_terminated_length": 197.125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.580335731414868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.04716695472598076, + "learning_rate": 1.7879733081733216e-05, + "loss": 0.0019, + "num_tokens": 26227012.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 467.125, + "completions/mean_terminated_length": 467.125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.5805201992252352, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.04029440716840327, + "learning_rate": 1.7877750190856022e-05, + "loss": 0.0016, + "num_tokens": 26238157.0, + "reward": 1.5854430198669434, + "reward_std": 0.40623417496681213, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7104430198669434, + "rewards/fixed_code_pass_all_test_reward/std": 0.11940551549196243, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 698.125, + "completions/mean_terminated_length": 698.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.5807046670356023, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.024738609325140715, + "learning_rate": 1.787576648326607e-05, + "loss": 0.001, + "num_tokens": 26253358.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 312.5, + "completions/mean_terminated_length": 312.5, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5808891348459694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.04490569664631039, + "learning_rate": 1.7873781959169026e-05, + "loss": 0.0018, + "num_tokens": 26262338.0, + "reward": 1.9296875, + "reward_std": 0.19887378811836243, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9296875, + "rewards/fixed_code_pass_all_test_reward/std": 0.19887378811836243, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.5810736026563365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.04533425020053983, + "learning_rate": 1.7871796618770632e-05, + "loss": 0.0018, + "num_tokens": 26269568.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.5812580704667035, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0635696614626795, + "learning_rate": 1.7869810462276712e-05, + "loss": 0.0025, + "num_tokens": 26277903.0, + "reward": 1.9249999523162842, + "reward_std": 0.10350986570119858, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9249999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.1035098284482956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 511.75, + "completions/mean_terminated_length": 511.75, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.5814425382770706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060791015625, + "kl": 0.04051905055530369, + "learning_rate": 1.7867823489893175e-05, + "loss": 0.0016, + "num_tokens": 26287469.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.5816270060874378, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.09875365742482245, + "learning_rate": 1.7865835701826025e-05, + "loss": 0.004, + "num_tokens": 26292364.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.5818114738978049, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.049717559479177, + "learning_rate": 1.7863847098281336e-05, + "loss": 0.002, + "num_tokens": 26304880.0, + "reward": 1.6989796161651611, + "reward_std": 0.13169653713703156, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6989796161651611, + "rewards/fixed_code_pass_all_test_reward/std": 0.13169650733470917, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 299.875, + "completions/mean_terminated_length": 299.875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.5819959417081719, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0691617710981518, + "learning_rate": 1.7861857679465275e-05, + "loss": 0.0028, + "num_tokens": 26312839.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.582180409518539, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.029023444862104952, + "learning_rate": 1.7859867445584092e-05, + "loss": 0.0012, + "num_tokens": 26318137.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5823648773289061, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.6875, + "kl": 0.09566710283979774, + "learning_rate": 1.7857876396844123e-05, + "loss": 0.0038, + "num_tokens": 26323008.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 489.75, + "completions/mean_terminated_length": 489.75, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.5825493451392731, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.04455329128541052, + "learning_rate": 1.7855884533451785e-05, + "loss": 0.0018, + "num_tokens": 26332438.0, + "reward": 1.7083333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 607.375, + "completions/mean_terminated_length": 607.375, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "epoch": 0.5827338129496403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.60546875, + "kl": 0.03936332673765719, + "learning_rate": 1.7853891855613578e-05, + "loss": 0.0016, + "num_tokens": 26345865.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.5829182807600074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.07044606003910303, + "learning_rate": 1.7851898363536094e-05, + "loss": 0.0028, + "num_tokens": 26351505.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 239.375, + "completions/mean_terminated_length": 239.375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5831027485703745, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.098426828160882, + "learning_rate": 1.7849904057426006e-05, + "loss": 0.0039, + "num_tokens": 26357004.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.5832872163807415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.07178407348692417, + "learning_rate": 1.7847908937490067e-05, + "loss": 0.0029, + "num_tokens": 26361692.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 368.625, + "completions/mean_terminated_length": 368.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.5834716841911086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.376953125, + "kl": 0.05240453558508307, + "learning_rate": 1.784591300393512e-05, + "loss": 0.0021, + "num_tokens": 26370769.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.5836561520014757, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.06555538438260555, + "learning_rate": 1.7843916256968082e-05, + "loss": 0.0026, + "num_tokens": 26377651.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 468.75, + "completions/mean_terminated_length": 468.75, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5838406198118429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.02451432344969362, + "learning_rate": 1.7841918696795976e-05, + "loss": 0.001, + "num_tokens": 26387345.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 290.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.58402508762221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.028775631450116634, + "learning_rate": 1.7839920323625888e-05, + "loss": 0.0012, + "num_tokens": 26392581.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 846.125, + "completions/mean_terminated_length": 846.125, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "epoch": 0.584209555432577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.482421875, + "kl": 0.01883645763155073, + "learning_rate": 1.7837921137665e-05, + "loss": 0.0008, + "num_tokens": 26409894.0, + "reward": 1.4166667461395264, + "reward_std": 0.6606874465942383, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 333.5, + "completions/mean_terminated_length": 333.5, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5843940232429441, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.05124183953739703, + "learning_rate": 1.7835921139120568e-05, + "loss": 0.002, + "num_tokens": 26420322.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 134.875, + "completions/mean_terminated_length": 134.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.5845784910533112, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.52734375, + "kl": 0.12231877679005265, + "learning_rate": 1.783392032819994e-05, + "loss": 0.0049, + "num_tokens": 26424297.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 167.0, + "completions/mean_terminated_length": 167.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.5847629588636782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.052390101831406355, + "learning_rate": 1.7831918705110555e-05, + "loss": 0.0021, + "num_tokens": 26432985.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 319.875, + "completions/mean_terminated_length": 319.875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.5849474266740454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.0412308715749532, + "learning_rate": 1.782991627005992e-05, + "loss": 0.0016, + "num_tokens": 26441728.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.5851318944844125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1435546875, + "kl": 0.06305244262330234, + "learning_rate": 1.782791302325563e-05, + "loss": 0.0025, + "num_tokens": 26445910.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 287.5, + "completions/mean_terminated_length": 287.5, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.5853163622947796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.06389524089172482, + "learning_rate": 1.7825908964905378e-05, + "loss": 0.0026, + "num_tokens": 26454762.0, + "reward": 1.059999942779541, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.05999999865889549, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 406.625, + "completions/mean_terminated_length": 406.625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.5855008301051466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8203125, + "kl": 0.027664771070703864, + "learning_rate": 1.782390409521693e-05, + "loss": 0.0011, + "num_tokens": 26467255.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1778.0, + "completions/max_terminated_length": 1778.0, + "completions/mean_length": 575.375, + "completions/mean_terminated_length": 575.375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.5856852979155137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.025251635815948248, + "learning_rate": 1.7821898414398134e-05, + "loss": 0.001, + "num_tokens": 26476722.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 236.875, + "completions/mean_terminated_length": 236.875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5858697657258808, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.028402433032169938, + "learning_rate": 1.7819891922656924e-05, + "loss": 0.0011, + "num_tokens": 26483737.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 719.0, + "completions/mean_terminated_length": 719.0, + "completions/min_length": 623.0, + "completions/min_terminated_length": 623.0, + "epoch": 0.586054233536248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6171875, + "kl": 0.021255666739307344, + "learning_rate": 1.7817884620201326e-05, + "loss": 0.0009, + "num_tokens": 26499049.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 320.0, + "completions/mean_terminated_length": 320.0, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.586238701346615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.048078659805469215, + "learning_rate": 1.7815876507239437e-05, + "loss": 0.0019, + "num_tokens": 26507977.0, + "reward": 1.587837815284729, + "reward_std": 0.06420189887285233, + "rewards/fixed_code_pass_all_test_reward/mean": 0.587837815284729, + "rewards/fixed_code_pass_all_test_reward/std": 0.06420188397169113, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 237.25, + "completions/mean_terminated_length": 237.25, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.5864231691569821, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.02891356567852199, + "learning_rate": 1.7813867583979454e-05, + "loss": 0.0012, + "num_tokens": 26517059.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 282.25, + "completions/mean_terminated_length": 282.25, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.5866076369673492, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.03221922158263624, + "learning_rate": 1.781185785062964e-05, + "loss": 0.0013, + "num_tokens": 26526917.0, + "reward": 1.3017240762710571, + "reward_std": 0.11173688620328903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3017241358757019, + "rewards/fixed_code_pass_all_test_reward/std": 0.11173690110445023, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.5867921047777163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.04003114975057542, + "learning_rate": 1.7809847307398352e-05, + "loss": 0.0016, + "num_tokens": 26531963.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 633.25, + "completions/mean_terminated_length": 431.14288330078125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.5869765725880833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4765625, + "kl": 0.027903419570066035, + "learning_rate": 1.7807835954494033e-05, + "loss": 0.0011, + "num_tokens": 26544885.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 280.625, + "completions/mean_terminated_length": 280.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5871610403984505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.06207381980493665, + "learning_rate": 1.7805823792125206e-05, + "loss": 0.0025, + "num_tokens": 26553954.0, + "reward": 1.7904411554336548, + "reward_std": 0.39754486083984375, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7904411554336548, + "rewards/fixed_code_pass_all_test_reward/std": 0.39754483103752136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.5873455082088176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.03612837055698037, + "learning_rate": 1.7803810820500475e-05, + "loss": 0.0014, + "num_tokens": 26562666.0, + "reward": 1.8541666269302368, + "reward_std": 0.058925606310367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8541666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 494.75, + "completions/mean_terminated_length": 494.75, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.5875299760191847, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78125, + "kl": 0.030450191698037088, + "learning_rate": 1.7801797039828534e-05, + "loss": 0.0012, + "num_tokens": 26571368.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5877144438295517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0322265625, + "kl": 0.02757272101007402, + "learning_rate": 1.7799782450318164e-05, + "loss": 0.0011, + "num_tokens": 26579849.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 343.0, + "completions/mean_terminated_length": 343.0, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5878989116399188, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.04705683677457273, + "learning_rate": 1.7797767052178216e-05, + "loss": 0.0019, + "num_tokens": 26587089.0, + "reward": 1.9464285373687744, + "reward_std": 0.1515229046344757, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.15152287483215332, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 342.75, + "completions/mean_terminated_length": 342.75, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5880833794502859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.037489519687369466, + "learning_rate": 1.7795750845617633e-05, + "loss": 0.0015, + "num_tokens": 26594543.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 315.125, + "completions/mean_terminated_length": 315.125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.5882678472606531, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.03123515483457595, + "learning_rate": 1.779373383084545e-05, + "loss": 0.0012, + "num_tokens": 26601712.0, + "reward": 1.8645833730697632, + "reward_std": 0.24372072517871857, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8645833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.24372074007987976, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5884523150710201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.04426570073701441, + "learning_rate": 1.7791716008070772e-05, + "loss": 0.0018, + "num_tokens": 26610691.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 246.25, + "completions/mean_terminated_length": 246.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5886367828813872, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.020781970990356058, + "learning_rate": 1.7789697377502793e-05, + "loss": 0.0008, + "num_tokens": 26616445.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 253.625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.5888212506917543, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27734375, + "kl": 0.08586529735475779, + "learning_rate": 1.778767793935079e-05, + "loss": 0.0034, + "num_tokens": 26622530.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 267.5, + "completions/mean_terminated_length": 267.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.5890057185021214, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.027313244179822505, + "learning_rate": 1.778565769382413e-05, + "loss": 0.0011, + "num_tokens": 26634358.0, + "reward": 1.3602941036224365, + "reward_std": 0.22366374731063843, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3602941334247589, + "rewards/fixed_code_pass_all_test_reward/std": 0.22366376221179962, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 261.75, + "completions/mean_terminated_length": 261.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.5891901863124884, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1328125, + "kl": 0.05368939717300236, + "learning_rate": 1.7783636641132257e-05, + "loss": 0.0021, + "num_tokens": 26642292.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 234.25, + "completions/mean_terminated_length": 234.25, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.5893746541228556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.04911674140021205, + "learning_rate": 1.7781614781484697e-05, + "loss": 0.002, + "num_tokens": 26651126.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5895591219332227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042236328125, + "kl": 0.031151547096669674, + "learning_rate": 1.7779592115091066e-05, + "loss": 0.0012, + "num_tokens": 26658758.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 281.75, + "completions/mean_terminated_length": 281.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5897435897435898, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.05404496344272047, + "learning_rate": 1.7777568642161058e-05, + "loss": 0.0022, + "num_tokens": 26669084.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 294.25, + "completions/mean_terminated_length": 294.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.5899280575539568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.0564572699368, + "learning_rate": 1.7775544362904456e-05, + "loss": 0.0023, + "num_tokens": 26676502.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 223.125, + "completions/mean_terminated_length": 223.125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.5901125253643239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.0726684033870697, + "learning_rate": 1.777351927753112e-05, + "loss": 0.0029, + "num_tokens": 26684007.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 228.0, + "completions/mean_terminated_length": 228.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.590296993174691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.158203125, + "kl": 0.05096497084014118, + "learning_rate": 1.7771493386251007e-05, + "loss": 0.002, + "num_tokens": 26692159.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 471.875, + "completions/mean_terminated_length": 471.875, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.5904814609850582, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.046799191273748875, + "learning_rate": 1.776946668927413e-05, + "loss": 0.0019, + "num_tokens": 26701598.0, + "reward": 1.7833333015441895, + "reward_std": 0.23162643611431122, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7833333015441895, + "rewards/fixed_code_pass_all_test_reward/std": 0.23162642121315002, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 671.5, + "completions/mean_terminated_length": 671.5, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "epoch": 0.5906659287954252, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.03589981538243592, + "learning_rate": 1.7767439186810628e-05, + "loss": 0.0014, + "num_tokens": 26717474.0, + "reward": 1.9791667461395264, + "reward_std": 0.017251623794436455, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.017251653596758842, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 255.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.5908503966057923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.053446023957803845, + "learning_rate": 1.7765410879070676e-05, + "loss": 0.0021, + "num_tokens": 26726083.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.5910348644161594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.0513851591385901, + "learning_rate": 1.7763381766264566e-05, + "loss": 0.0021, + "num_tokens": 26732065.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 339.625, + "completions/mean_terminated_length": 339.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.5912193322265265, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.07344930665567517, + "learning_rate": 1.7761351848602664e-05, + "loss": 0.0029, + "num_tokens": 26741958.0, + "reward": 1.850000023841858, + "reward_std": 0.2777460217475891, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.2777460217475891, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 323.875, + "completions/mean_terminated_length": 323.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.5914038000368935, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.04352594749070704, + "learning_rate": 1.7759321126295413e-05, + "loss": 0.0017, + "num_tokens": 26752981.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.5915882678472607, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.03694290772546083, + "learning_rate": 1.7757289599553353e-05, + "loss": 0.0015, + "num_tokens": 26758395.0, + "reward": 1.899999976158142, + "reward_std": 0.2828426957130432, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 235.375, + "completions/mean_terminated_length": 235.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5917727356576278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.028339531272649765, + "learning_rate": 1.7755257268587095e-05, + "loss": 0.0011, + "num_tokens": 26766158.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 299.5, + "completions/mean_terminated_length": 299.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5919572034679949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.05185215000528842, + "learning_rate": 1.7753224133607332e-05, + "loss": 0.0021, + "num_tokens": 26772098.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.0, + "completions/max_terminated_length": 667.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5921416712783619, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.06000466225668788, + "learning_rate": 1.7751190194824852e-05, + "loss": 0.0024, + "num_tokens": 26781813.0, + "reward": 1.744949460029602, + "reward_std": 0.4536868631839752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.744949460029602, + "rewards/fixed_code_pass_all_test_reward/std": 0.45368683338165283, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 400.25, + "completions/mean_terminated_length": 400.25, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.592326139088729, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.040871202014386654, + "learning_rate": 1.774915545245052e-05, + "loss": 0.0016, + "num_tokens": 26788655.0, + "reward": 1.5750000476837158, + "reward_std": 0.31052953004837036, + "rewards/fixed_code_pass_all_test_reward/mean": 0.574999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.31052953004837036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.5925106068990961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.05978077370673418, + "learning_rate": 1.7747119906695282e-05, + "loss": 0.0024, + "num_tokens": 26794626.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 141.25, + "completions/mean_terminated_length": 141.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.5926950747094631, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.09818722493946552, + "learning_rate": 1.774508355777017e-05, + "loss": 0.0039, + "num_tokens": 26798676.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5928795425198303, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.10449459427036345, + "learning_rate": 1.77430464058863e-05, + "loss": 0.0042, + "num_tokens": 26805852.0, + "reward": 1.7589285373687744, + "reward_std": 0.446785569190979, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8839285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.32829955220222473, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.5930640103301974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.052320567425340414, + "learning_rate": 1.774100845125488e-05, + "loss": 0.0021, + "num_tokens": 26810291.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 283.625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.5932484781405645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.059235129272565246, + "learning_rate": 1.7738969694087172e-05, + "loss": 0.0024, + "num_tokens": 26819080.0, + "reward": 1.7541667222976685, + "reward_std": 0.12321922928094864, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7541667222976685, + "rewards/fixed_code_pass_all_test_reward/std": 0.12321923673152924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 178.375, + "completions/mean_terminated_length": 178.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.5934329459509315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.04821228561922908, + "learning_rate": 1.7736930134594553e-05, + "loss": 0.0019, + "num_tokens": 26823315.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 350.5, + "completions/mean_terminated_length": 350.5, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.5936174137612986, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.04046152252703905, + "learning_rate": 1.7734889772988473e-05, + "loss": 0.0016, + "num_tokens": 26836471.0, + "reward": 1.6551724672317505, + "reward_std": 0.4235307574272156, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6551724076271057, + "rewards/fixed_code_pass_all_test_reward/std": 0.4235307276248932, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 419.875, + "completions/mean_terminated_length": 419.875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5938018815716657, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.051053815986961126, + "learning_rate": 1.7732848609480455e-05, + "loss": 0.002, + "num_tokens": 26849046.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 259.5, + "completions/mean_terminated_length": 259.5, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5939863493820329, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.049739392241463065, + "learning_rate": 1.773080664428212e-05, + "loss": 0.002, + "num_tokens": 26857050.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.5941708171924, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.09605036024004221, + "learning_rate": 1.7728763877605162e-05, + "loss": 0.0038, + "num_tokens": 26862366.0, + "reward": 1.5, + "reward_std": 0.20616242289543152, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.2061624377965927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.594355285002767, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.06136267213150859, + "learning_rate": 1.7726720309661363e-05, + "loss": 0.0025, + "num_tokens": 26868814.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 371.75, + "completions/mean_terminated_length": 371.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.5945397528131341, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.07893473096191883, + "learning_rate": 1.7724675940662585e-05, + "loss": 0.0032, + "num_tokens": 26880812.0, + "reward": 1.600806474685669, + "reward_std": 0.20103605091571808, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6008064150810242, + "rewards/fixed_code_pass_all_test_reward/std": 0.20103605091571808, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 351.75, + "completions/mean_terminated_length": 351.75, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5947242206235012, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.05931929824873805, + "learning_rate": 1.7722630770820776e-05, + "loss": 0.0024, + "num_tokens": 26889122.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 333.875, + "completions/mean_terminated_length": 333.875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5949086884338682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.07043121685273945, + "learning_rate": 1.7720584800347965e-05, + "loss": 0.0028, + "num_tokens": 26900305.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 334.125, + "completions/mean_terminated_length": 334.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.5950931562442354, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.08597835060209036, + "learning_rate": 1.7718538029456266e-05, + "loss": 0.0034, + "num_tokens": 26909730.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 506.875, + "completions/mean_terminated_length": 506.875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5952776240546025, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.049699308816343546, + "learning_rate": 1.7716490458357868e-05, + "loss": 0.002, + "num_tokens": 26922849.0, + "reward": 1.9872881174087524, + "reward_std": 0.035954609513282776, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9872881174087524, + "rewards/fixed_code_pass_all_test_reward/std": 0.035954590886831284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 385.125, + "completions/mean_terminated_length": 385.125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.5954620918649696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.0412621502764523, + "learning_rate": 1.7714442087265064e-05, + "loss": 0.0017, + "num_tokens": 26932810.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 422.125, + "completions/mean_terminated_length": 422.125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.5956465596753366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.02958440571092069, + "learning_rate": 1.77123929163902e-05, + "loss": 0.0012, + "num_tokens": 26944819.0, + "reward": 1.295454502105713, + "reward_std": 0.06428244709968567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29545456171035767, + "rewards/fixed_code_pass_all_test_reward/std": 0.06428243964910507, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 296.875, + "completions/mean_terminated_length": 296.875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5958310274857037, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.045489123091101646, + "learning_rate": 1.7710342945945725e-05, + "loss": 0.0018, + "num_tokens": 26951490.0, + "reward": 1.4375, + "reward_std": 0.2825268805027008, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2825268805027008, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 328.375, + "completions/mean_terminated_length": 328.375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.5960154952960708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90234375, + "kl": 0.040379425743594766, + "learning_rate": 1.7708292176144173e-05, + "loss": 0.0016, + "num_tokens": 26960525.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 415.5, + "completions/mean_terminated_length": 415.5, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.596199963106438, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.04919778439216316, + "learning_rate": 1.7706240607198143e-05, + "loss": 0.002, + "num_tokens": 26972201.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 145.25, + "completions/mean_terminated_length": 145.25, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.596384430916805, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.09180603828281164, + "learning_rate": 1.7704188239320343e-05, + "loss": 0.0037, + "num_tokens": 26976211.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 229.25, + "completions/mean_terminated_length": 229.25, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.5965688987271721, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.020150380791164935, + "learning_rate": 1.7702135072723532e-05, + "loss": 0.0008, + "num_tokens": 26981749.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 427.75, + "completions/mean_terminated_length": 427.75, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.5967533665375392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.053825351991690695, + "learning_rate": 1.7700081107620582e-05, + "loss": 0.0022, + "num_tokens": 26991763.0, + "reward": 1.1363636255264282, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13636364042758942, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 469.625, + "completions/mean_terminated_length": 469.625, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.5969378343479063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.04426286509260535, + "learning_rate": 1.7698026344224425e-05, + "loss": 0.0018, + "num_tokens": 27007272.0, + "reward": 1.90625, + "reward_std": 0.18600594997406006, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.18600596487522125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 226.25, + "completions/mean_terminated_length": 226.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.5971223021582733, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.04161699931137264, + "learning_rate": 1.7695970782748092e-05, + "loss": 0.0017, + "num_tokens": 27020794.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1103.0, + "completions/max_terminated_length": 1103.0, + "completions/mean_length": 491.375, + "completions/mean_terminated_length": 491.375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.5973067699686405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75390625, + "kl": 0.04408438364043832, + "learning_rate": 1.7693914423404687e-05, + "loss": 0.0018, + "num_tokens": 27030693.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.5974912377790076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.058891989290714264, + "learning_rate": 1.7691857266407398e-05, + "loss": 0.0024, + "num_tokens": 27036475.0, + "reward": 1.649999976158142, + "reward_std": 0.3767024576663971, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6499999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.3767024874687195, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.5976757055893747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.05321342567913234, + "learning_rate": 1.76897993119695e-05, + "loss": 0.0021, + "num_tokens": 27040519.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 182.375, + "completions/mean_terminated_length": 182.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.5978601733997417, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1396484375, + "kl": 0.13785012532025576, + "learning_rate": 1.7687740560304352e-05, + "loss": 0.0055, + "num_tokens": 27044954.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 193.5, + "completions/mean_terminated_length": 193.5, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.5980446412101088, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3125, + "kl": 0.3188199058640748, + "learning_rate": 1.7685681011625382e-05, + "loss": 0.0128, + "num_tokens": 27049350.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 569.875, + "completions/mean_terminated_length": 569.875, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 0.5982291090204759, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3671875, + "kl": 0.050047642551362514, + "learning_rate": 1.7683620666146116e-05, + "loss": 0.002, + "num_tokens": 27061109.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 172.0, + "completions/mean_terminated_length": 172.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5984135768308431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46875, + "kl": 0.055553977843374014, + "learning_rate": 1.768155952408016e-05, + "loss": 0.0022, + "num_tokens": 27065253.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5985980446412101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.05457189166918397, + "learning_rate": 1.7679497585641193e-05, + "loss": 0.0022, + "num_tokens": 27069636.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 346.125, + "completions/mean_terminated_length": 346.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.5987825124515772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.05175937106832862, + "learning_rate": 1.7677434851042985e-05, + "loss": 0.0021, + "num_tokens": 27078277.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 100.125, + "completions/mean_terminated_length": 100.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.5989669802619443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.07112702820450068, + "learning_rate": 1.767537132049939e-05, + "loss": 0.0028, + "num_tokens": 27081798.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 446.125, + "completions/mean_terminated_length": 446.125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.5991514480723114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.03650404326617718, + "learning_rate": 1.7673306994224335e-05, + "loss": 0.0015, + "num_tokens": 27090743.0, + "reward": 1.7678570747375488, + "reward_std": 0.3576526939868927, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7678571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.3576527237892151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 304.75, + "completions/mean_terminated_length": 304.75, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.5993359158826784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.045535961631685495, + "learning_rate": 1.767124187243184e-05, + "loss": 0.0018, + "num_tokens": 27097405.0, + "reward": 1.2127659320831299, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21276596188545227, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 236.875, + "completions/mean_terminated_length": 236.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.5995203836930456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.1333378052804619, + "learning_rate": 1.7669175955336008e-05, + "loss": 0.0053, + "num_tokens": 27102156.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 159.75, + "completions/mean_terminated_length": 159.75, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.5997048515034127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.06703867088072002, + "learning_rate": 1.7667109243151006e-05, + "loss": 0.0027, + "num_tokens": 27108330.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 214.125, + "completions/mean_terminated_length": 214.125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5998893193137798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.028717777808196843, + "learning_rate": 1.766504173609111e-05, + "loss": 0.0011, + "num_tokens": 27113139.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 407.25, + "completions/mean_terminated_length": 407.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.6000737871241468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.03851071931421757, + "learning_rate": 1.766297343437066e-05, + "loss": 0.0015, + "num_tokens": 27120821.0, + "reward": 1.6764705181121826, + "reward_std": 0.05445994809269905, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6764706373214722, + "rewards/fixed_code_pass_all_test_reward/std": 0.05446000397205353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 278.625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6002582549345139, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.060590768698602915, + "learning_rate": 1.7660904338204077e-05, + "loss": 0.0024, + "num_tokens": 27127274.0, + "reward": 1.8031914234161377, + "reward_std": 0.36441856622695923, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8031914830207825, + "rewards/fixed_code_pass_all_test_reward/std": 0.36441853642463684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 626.875, + "completions/mean_terminated_length": 626.875, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.600442722744881, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.58203125, + "kl": 0.024945134646259248, + "learning_rate": 1.7658834447805886e-05, + "loss": 0.001, + "num_tokens": 27143665.0, + "reward": 1.9642857313156128, + "reward_std": 0.06612997502088547, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 352.375, + "completions/mean_terminated_length": 352.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6006271905552482, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.05722511187195778, + "learning_rate": 1.765676376339067e-05, + "loss": 0.0023, + "num_tokens": 27154068.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 293.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.6008116583656152, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1943359375, + "kl": 0.03862951137125492, + "learning_rate": 1.7654692285173103e-05, + "loss": 0.0015, + "num_tokens": 27159534.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 284.375, + "completions/mean_terminated_length": 284.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.6009961261759823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.05706696794368327, + "learning_rate": 1.7652620013367944e-05, + "loss": 0.0023, + "num_tokens": 27167953.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.6011805939863494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.056513707619160414, + "learning_rate": 1.7650546948190036e-05, + "loss": 0.0023, + "num_tokens": 27177382.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 186.375, + "completions/mean_terminated_length": 186.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.6013650617967164, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.03343645087443292, + "learning_rate": 1.7648473089854293e-05, + "loss": 0.0013, + "num_tokens": 27181753.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 239.875, + "completions/mean_terminated_length": 239.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6015495296070835, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.06353238178417087, + "learning_rate": 1.764639843857573e-05, + "loss": 0.0025, + "num_tokens": 27187848.0, + "reward": 1.774999976158142, + "reward_std": 0.4200340211391449, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.4200340509414673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 214.375, + "completions/mean_terminated_length": 214.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6017339974174507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.10659371362999082, + "learning_rate": 1.7644322994569424e-05, + "loss": 0.0043, + "num_tokens": 27196611.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 302.0, + "completions/mean_terminated_length": 302.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.6019184652278178, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.06496565137058496, + "learning_rate": 1.764224675805054e-05, + "loss": 0.0026, + "num_tokens": 27202859.0, + "reward": 1.7088816165924072, + "reward_std": 0.19910727441310883, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7088816165924072, + "rewards/fixed_code_pass_all_test_reward/std": 0.1991073042154312, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 409.75, + "completions/mean_terminated_length": 409.75, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.6021029330381849, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.07596419425681233, + "learning_rate": 1.764016972923434e-05, + "loss": 0.003, + "num_tokens": 27213065.0, + "reward": 1.8899999856948853, + "reward_std": 0.3111270070075989, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8899999856948853, + "rewards/fixed_code_pass_all_test_reward/std": 0.3111269772052765, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 338.25, + "completions/mean_terminated_length": 338.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.6022874008485519, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.06824757833965123, + "learning_rate": 1.763809190833615e-05, + "loss": 0.0027, + "num_tokens": 27222955.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 420.375, + "completions/mean_terminated_length": 420.375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.602471868658919, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.06158673297613859, + "learning_rate": 1.7636013295571382e-05, + "loss": 0.0025, + "num_tokens": 27234758.0, + "reward": 1.4722222089767456, + "reward_std": 0.6417293548583984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5972222089767456, + "rewards/fixed_code_pass_all_test_reward/std": 0.34085431694984436, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 359.25, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.6026563364692861, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.026337135583162308, + "learning_rate": 1.7633933891155538e-05, + "loss": 0.0011, + "num_tokens": 27242280.0, + "reward": 1.4973958730697632, + "reward_std": 0.14323261380195618, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4973958432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.1432325690984726, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 585.75, + "completions/mean_terminated_length": 585.75, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.6028408042796533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.03041464532725513, + "learning_rate": 1.7631853695304194e-05, + "loss": 0.0012, + "num_tokens": 27253550.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 296.75, + "completions/mean_terminated_length": 296.75, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6030252720900203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.06288192025385797, + "learning_rate": 1.762977270823301e-05, + "loss": 0.0025, + "num_tokens": 27262964.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 557.5, + "completions/mean_terminated_length": 557.5, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.6032097399003874, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.03864532336592674, + "learning_rate": 1.762769093015773e-05, + "loss": 0.0015, + "num_tokens": 27276032.0, + "reward": 0.4166666865348816, + "reward_std": 0.7715167999267578, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 348.625, + "completions/mean_terminated_length": 348.625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6033942077107545, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.04162590950727463, + "learning_rate": 1.7625608361294183e-05, + "loss": 0.0017, + "num_tokens": 27286709.0, + "reward": 1.85326087474823, + "reward_std": 0.2288147658109665, + "rewards/fixed_code_pass_all_test_reward/mean": 0.85326087474823, + "rewards/fixed_code_pass_all_test_reward/std": 0.22881478071212769, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 267.875, + "completions/mean_terminated_length": 267.875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6035786755211215, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.04989181808196008, + "learning_rate": 1.762352500185827e-05, + "loss": 0.002, + "num_tokens": 27296772.0, + "reward": 1.811274528503418, + "reward_std": 0.3732026517391205, + "rewards/fixed_code_pass_all_test_reward/mean": 0.811274528503418, + "rewards/fixed_code_pass_all_test_reward/std": 0.3732026517391205, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 563.125, + "completions/mean_terminated_length": 563.125, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.6037631433314886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.03178279101848602, + "learning_rate": 1.7621440852065986e-05, + "loss": 0.0013, + "num_tokens": 27307189.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 248.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.6039476111418558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.07737881364300847, + "learning_rate": 1.7619355912133395e-05, + "loss": 0.0031, + "num_tokens": 27314537.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 257.875, + "completions/mean_terminated_length": 257.875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6041320789522229, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04525035014376044, + "learning_rate": 1.7617270182276655e-05, + "loss": 0.0018, + "num_tokens": 27320544.0, + "reward": 1.5257353782653809, + "reward_std": 0.25948992371559143, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5257353186607361, + "rewards/fixed_code_pass_all_test_reward/std": 0.25948992371559143, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 297.0, + "completions/mean_terminated_length": 297.0, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.60431654676259, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.06083514983765781, + "learning_rate": 1.7615183662711992e-05, + "loss": 0.0024, + "num_tokens": 27331200.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 247.75, + "completions/mean_terminated_length": 247.75, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.604501014572957, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.06710966257378459, + "learning_rate": 1.7613096353655735e-05, + "loss": 0.0027, + "num_tokens": 27337254.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 258.625, + "completions/mean_terminated_length": 258.625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6046854823833241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.03686751937493682, + "learning_rate": 1.7611008255324272e-05, + "loss": 0.0015, + "num_tokens": 27346163.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 405.75, + "completions/mean_terminated_length": 405.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.6048699501936912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.06287900824099779, + "learning_rate": 1.760891936793409e-05, + "loss": 0.0025, + "num_tokens": 27356121.0, + "reward": 1.6336207389831543, + "reward_std": 0.21126903593540192, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6336207389831543, + "rewards/fixed_code_pass_all_test_reward/std": 0.21126903593540192, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 509.625, + "completions/mean_terminated_length": 289.8571472167969, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.6050544180040582, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.057863643509335816, + "learning_rate": 1.7606829691701746e-05, + "loss": 0.0023, + "num_tokens": 27368734.0, + "reward": 1.329545497894287, + "reward_std": 0.6325255036354065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5795454382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.3746308982372284, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.6052388858144254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.040455823531374335, + "learning_rate": 1.7604739226843884e-05, + "loss": 0.0016, + "num_tokens": 27376488.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 252.25, + "completions/mean_terminated_length": 252.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.6054233536247925, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.51171875, + "kl": 0.0893332022242248, + "learning_rate": 1.7602647973577235e-05, + "loss": 0.0036, + "num_tokens": 27384842.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 348.5, + "completions/mean_terminated_length": 348.5, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6056078214351596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0565935664344579, + "learning_rate": 1.7600555932118602e-05, + "loss": 0.0023, + "num_tokens": 27394950.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 340.375, + "completions/mean_terminated_length": 340.375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.6057922892455266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.05287824268452823, + "learning_rate": 1.7598463102684872e-05, + "loss": 0.0021, + "num_tokens": 27403081.0, + "reward": 1.4675325155258179, + "reward_std": 0.33320876955986023, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4675324559211731, + "rewards/fixed_code_pass_all_test_reward/std": 0.3332087993621826, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 401.5, + "completions/mean_terminated_length": 401.5, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.6059767570558937, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.02626150520518422, + "learning_rate": 1.7596369485493022e-05, + "loss": 0.0011, + "num_tokens": 27410733.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 316.125, + "completions/mean_terminated_length": 316.125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6061612248662608, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.09091789182275534, + "learning_rate": 1.75942750807601e-05, + "loss": 0.0036, + "num_tokens": 27419222.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 330.375, + "completions/mean_terminated_length": 330.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.606345692676628, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.07641315320506692, + "learning_rate": 1.7592179888703234e-05, + "loss": 0.0031, + "num_tokens": 27432169.0, + "reward": 1.5969388484954834, + "reward_std": 0.36192142963409424, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7219387888908386, + "rewards/fixed_code_pass_all_test_reward/std": 0.18153518438339233, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 348.75, + "completions/mean_terminated_length": 348.75, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.606530160486995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.04969023144803941, + "learning_rate": 1.7590083909539655e-05, + "loss": 0.002, + "num_tokens": 27441831.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 294.625, + "completions/mean_terminated_length": 294.625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6067146282973621, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.04731839359737933, + "learning_rate": 1.7587987143486645e-05, + "loss": 0.0019, + "num_tokens": 27448428.0, + "reward": 1.795212745666504, + "reward_std": 0.08274652063846588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7952128052711487, + "rewards/fixed_code_pass_all_test_reward/std": 0.08274652808904648, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 256.875, + "completions/mean_terminated_length": 256.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6068990961077292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.02953702607192099, + "learning_rate": 1.758588959076159e-05, + "loss": 0.0012, + "num_tokens": 27457955.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 276.625, + "completions/mean_terminated_length": 276.625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.6070835639180963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.02751934784464538, + "learning_rate": 1.7583791251581953e-05, + "loss": 0.0011, + "num_tokens": 27466432.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 325.125, + "completions/mean_terminated_length": 325.125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.6072680317284633, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.734375, + "kl": 0.0611198334954679, + "learning_rate": 1.758169212616527e-05, + "loss": 0.0024, + "num_tokens": 27476281.0, + "reward": 1.2553191184997559, + "reward_std": 0.09914592653512955, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25531914830207825, + "rewards/fixed_code_pass_all_test_reward/std": 0.09914593398571014, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 239.875, + "completions/mean_terminated_length": 239.875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.6074524995388305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.05263520427979529, + "learning_rate": 1.7579592214729166e-05, + "loss": 0.0021, + "num_tokens": 27485024.0, + "reward": 1.9393939971923828, + "reward_std": 0.17141982913017273, + "rewards/fixed_code_pass_all_test_reward/mean": 0.939393937587738, + "rewards/fixed_code_pass_all_test_reward/std": 0.17141982913017273, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.6076369673491976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.025855178595520556, + "learning_rate": 1.7577491517491345e-05, + "loss": 0.001, + "num_tokens": 27491426.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 184.75, + "completions/mean_terminated_length": 184.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6078214351595647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.06575867533683777, + "learning_rate": 1.7575390034669594e-05, + "loss": 0.0026, + "num_tokens": 27496008.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 251.25, + "completions/mean_terminated_length": 251.25, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6080059029699317, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.05254435818642378, + "learning_rate": 1.7573287766481785e-05, + "loss": 0.0021, + "num_tokens": 27504210.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 239.75, + "completions/mean_terminated_length": 239.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.6081903707802988, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.04450291907414794, + "learning_rate": 1.7571184713145863e-05, + "loss": 0.0018, + "num_tokens": 27512144.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 388.0, + "completions/mean_terminated_length": 388.0, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.6083748385906659, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04374199407175183, + "learning_rate": 1.7569080874879856e-05, + "loss": 0.0017, + "num_tokens": 27519968.0, + "reward": 1.2426470518112183, + "reward_std": 0.23621143400669098, + "rewards/fixed_code_pass_all_test_reward/mean": 0.36764705181121826, + "rewards/fixed_code_pass_all_test_reward/std": 0.24707883596420288, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 282.125, + "completions/mean_terminated_length": 282.125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.6085593064010331, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.03455950319766998, + "learning_rate": 1.756697625190188e-05, + "loss": 0.0014, + "num_tokens": 27526633.0, + "reward": 1.83695650100708, + "reward_std": 0.10063259303569794, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8369565010070801, + "rewards/fixed_code_pass_all_test_reward/std": 0.10063262283802032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 172.5, + "completions/mean_terminated_length": 172.5, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.6087437742114001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.029937791405245662, + "learning_rate": 1.756487084443013e-05, + "loss": 0.0012, + "num_tokens": 27530909.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 230.125, + "completions/mean_terminated_length": 230.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.6089282420217672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.0454005200881511, + "learning_rate": 1.7562764652682876e-05, + "loss": 0.0018, + "num_tokens": 27538278.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 295.125, + "completions/mean_terminated_length": 295.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6091127098321343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.0438581247581169, + "learning_rate": 1.7560657676878477e-05, + "loss": 0.0018, + "num_tokens": 27547943.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.6092971776425014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.03327989799436182, + "learning_rate": 1.755854991723537e-05, + "loss": 0.0013, + "num_tokens": 27552782.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 304.125, + "completions/mean_terminated_length": 304.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.6094816454528684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23828125, + "kl": 0.04942752467468381, + "learning_rate": 1.7556441373972072e-05, + "loss": 0.002, + "num_tokens": 27564175.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 435.125, + "completions/mean_terminated_length": 435.125, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.6096661132632356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.024697610642760992, + "learning_rate": 1.7554332047307183e-05, + "loss": 0.001, + "num_tokens": 27573496.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 191.25, + "completions/mean_terminated_length": 191.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.6098505810736027, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.04457639798056334, + "learning_rate": 1.7552221937459385e-05, + "loss": 0.0018, + "num_tokens": 27578282.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 155.625, + "completions/mean_terminated_length": 155.625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.6100350488839698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.07007634686306119, + "learning_rate": 1.755011104464744e-05, + "loss": 0.0028, + "num_tokens": 27586295.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 493.625, + "completions/mean_terminated_length": 271.5714416503906, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.6102195166943368, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4453125, + "kl": 0.017332305782474577, + "learning_rate": 1.754799936909019e-05, + "loss": 0.0007, + "num_tokens": 27593684.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 290.625, + "completions/mean_terminated_length": 290.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.6104039845047039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.029942544177174568, + "learning_rate": 1.7545886911006563e-05, + "loss": 0.0012, + "num_tokens": 27603265.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 208.25, + "completions/mean_terminated_length": 208.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.610588452315071, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.328125, + "kl": 0.07714909967035055, + "learning_rate": 1.7543773670615557e-05, + "loss": 0.0031, + "num_tokens": 27610147.0, + "reward": 1.3256173133850098, + "reward_std": 0.2386571317911148, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3256172835826874, + "rewards/fixed_code_pass_all_test_reward/std": 0.2386571615934372, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.6107729201254382, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.10459733661264181, + "learning_rate": 1.754165964813627e-05, + "loss": 0.0042, + "num_tokens": 27617367.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6109573879358052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.035625193966552615, + "learning_rate": 1.753954484378786e-05, + "loss": 0.0014, + "num_tokens": 27627240.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 185.375, + "completions/mean_terminated_length": 185.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.6111418557461723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.02185181574895978, + "learning_rate": 1.753742925778958e-05, + "loss": 0.0009, + "num_tokens": 27631795.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 256.5, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.6113263235565394, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.06494635296985507, + "learning_rate": 1.7535312890360757e-05, + "loss": 0.0026, + "num_tokens": 27640519.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 497.25, + "completions/mean_terminated_length": 497.25, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "epoch": 0.6115107913669064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.04333417466841638, + "learning_rate": 1.753319574172081e-05, + "loss": 0.0017, + "num_tokens": 27650369.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.6116952591772735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.04373086185660213, + "learning_rate": 1.7531077812089222e-05, + "loss": 0.0017, + "num_tokens": 27659453.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 293.625, + "completions/mean_terminated_length": 293.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.6118797269876407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.02060942817479372, + "learning_rate": 1.752895910168557e-05, + "loss": 0.0008, + "num_tokens": 27665242.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 218.5, + "completions/mean_terminated_length": 218.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6120641947980078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.0870751915499568, + "learning_rate": 1.75268396107295e-05, + "loss": 0.0035, + "num_tokens": 27673462.0, + "reward": 1.0555555820465088, + "reward_std": 0.05939141660928726, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0555555559694767, + "rewards/fixed_code_pass_all_test_reward/std": 0.059391386806964874, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 126.875, + "completions/mean_terminated_length": 126.875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6122486626083748, + "frac_reward_zero_std": 0.0, + "grad_norm": 14.4375, + "kl": 0.08483859500847757, + "learning_rate": 1.752471933944076e-05, + "loss": 0.0034, + "num_tokens": 27677173.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.6124331304187419, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.061213011387735605, + "learning_rate": 1.752259828803916e-05, + "loss": 0.0024, + "num_tokens": 27681542.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 482.75, + "completions/mean_terminated_length": 482.75, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.612617598229109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5625, + "kl": 0.02059119159821421, + "learning_rate": 1.752047645674459e-05, + "loss": 0.0008, + "num_tokens": 27694452.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 168.125, + "completions/mean_terminated_length": 168.125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.6128020660394761, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.040000207256525755, + "learning_rate": 1.7518353845777038e-05, + "loss": 0.0016, + "num_tokens": 27698613.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 236.25, + "completions/mean_terminated_length": 236.25, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.6129865338498433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.05184913636185229, + "learning_rate": 1.7516230455356556e-05, + "loss": 0.0021, + "num_tokens": 27707823.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 500.625, + "completions/mean_terminated_length": 279.5714416503906, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6131710016602103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.64453125, + "kl": 0.04622071864287136, + "learning_rate": 1.7514106285703283e-05, + "loss": 0.0018, + "num_tokens": 27719348.0, + "reward": 1.3660714626312256, + "reward_std": 0.5917157530784607, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4910714328289032, + "rewards/fixed_code_pass_all_test_reward/std": 0.29124119877815247, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 216.75, + "completions/mean_terminated_length": 216.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6133554694705774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.07622893061488867, + "learning_rate": 1.751198133703744e-05, + "loss": 0.003, + "num_tokens": 27727626.0, + "reward": 1.798076868057251, + "reward_std": 0.3761429488658905, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7980769276618958, + "rewards/fixed_code_pass_all_test_reward/std": 0.3761429488658905, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.6135399372809445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26171875, + "kl": 0.07414495339617133, + "learning_rate": 1.7509855609579332e-05, + "loss": 0.003, + "num_tokens": 27731937.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 232.25, + "completions/mean_terminated_length": 232.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6137244050913115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.02651358162984252, + "learning_rate": 1.7507729103549328e-05, + "loss": 0.0011, + "num_tokens": 27737899.0, + "reward": 1.9711538553237915, + "reward_std": 0.0815892145037651, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9711538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.0815892368555069, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 183.0, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6139088729016786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36328125, + "kl": 0.09215324791148305, + "learning_rate": 1.7505601819167904e-05, + "loss": 0.0037, + "num_tokens": 27745019.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.6140933407120458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04909712774679065, + "learning_rate": 1.7503473756655594e-05, + "loss": 0.002, + "num_tokens": 27753138.0, + "reward": 1.9764705896377563, + "reward_std": 0.06655122339725494, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9764705896377563, + "rewards/fixed_code_pass_all_test_reward/std": 0.06655122339725494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 422.375, + "completions/mean_terminated_length": 422.375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.6142778085224129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.031882311566732824, + "learning_rate": 1.7501344916233027e-05, + "loss": 0.0013, + "num_tokens": 27761893.0, + "reward": 1.8989362716674805, + "reward_std": 0.12854301929473877, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8989361524581909, + "rewards/fixed_code_pass_all_test_reward/std": 0.12854304909706116, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.6144622763327799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.06847328878939152, + "learning_rate": 1.74992152981209e-05, + "loss": 0.0027, + "num_tokens": 27769917.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 603.0, + "completions/mean_terminated_length": 603.0, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.614646744143147, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.05860861111432314, + "learning_rate": 1.7497084902540004e-05, + "loss": 0.0023, + "num_tokens": 27785789.0, + "reward": 1.1607142686843872, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 476.75, + "completions/mean_terminated_length": 476.75, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.6148312119535141, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.027779660536907613, + "learning_rate": 1.74949537297112e-05, + "loss": 0.0011, + "num_tokens": 27800011.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.6150156797638812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7421875, + "kl": 0.09088262682780623, + "learning_rate": 1.7492821779855432e-05, + "loss": 0.0036, + "num_tokens": 27805797.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 331.125, + "completions/mean_terminated_length": 331.125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6152001475742483, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.03620972763746977, + "learning_rate": 1.7490689053193735e-05, + "loss": 0.0014, + "num_tokens": 27813038.0, + "reward": 1.584302306175232, + "reward_std": 0.13977688550949097, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5843023061752319, + "rewards/fixed_code_pass_all_test_reward/std": 0.13977693021297455, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 195.375, + "completions/mean_terminated_length": 195.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.6153846153846154, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.028599119978025556, + "learning_rate": 1.7488555549947214e-05, + "loss": 0.0011, + "num_tokens": 27817353.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6155690831949825, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.05023747938685119, + "learning_rate": 1.7486421270337047e-05, + "loss": 0.002, + "num_tokens": 27823536.0, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 299.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6157535510053496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.055596537655219436, + "learning_rate": 1.748428621458451e-05, + "loss": 0.0022, + "num_tokens": 27832094.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.6159380188157166, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.03635450708679855, + "learning_rate": 1.748215038291095e-05, + "loss": 0.0015, + "num_tokens": 27841650.0, + "reward": 1.875, + "reward_std": 0.05050760135054588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6161224866260837, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.07267401181161404, + "learning_rate": 1.7480013775537797e-05, + "loss": 0.0029, + "num_tokens": 27846993.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 101.0, + "completions/mean_terminated_length": 101.0, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.6163069544364509, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.03125, + "kl": 0.06279017543420196, + "learning_rate": 1.7477876392686557e-05, + "loss": 0.0025, + "num_tokens": 27850513.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 863.5, + "completions/mean_terminated_length": 863.5, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "epoch": 0.616491422246818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.57421875, + "kl": 0.031212733825668693, + "learning_rate": 1.7475738234578822e-05, + "loss": 0.0012, + "num_tokens": 27869229.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 251.375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.616675890057185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.04730022861622274, + "learning_rate": 1.747359930143626e-05, + "loss": 0.0019, + "num_tokens": 27878248.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 268.0, + "completions/mean_terminated_length": 268.0, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6168603578675521, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.03963503846898675, + "learning_rate": 1.7471459593480626e-05, + "loss": 0.0016, + "num_tokens": 27884544.0, + "reward": 1.9617347717285156, + "reward_std": 0.1082305982708931, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9617347121238708, + "rewards/fixed_code_pass_all_test_reward/std": 0.10823062062263489, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 481.125, + "completions/mean_terminated_length": 481.125, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.6170448256779192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.04432960296981037, + "learning_rate": 1.746931911093374e-05, + "loss": 0.0018, + "num_tokens": 27893785.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6172292934882863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7734375, + "kl": 0.049275541212409735, + "learning_rate": 1.7467177854017528e-05, + "loss": 0.002, + "num_tokens": 27903111.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 411.875, + "completions/mean_terminated_length": 411.875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6174137612986533, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.06680664815939963, + "learning_rate": 1.746503582295397e-05, + "loss": 0.0027, + "num_tokens": 27915590.0, + "reward": 1.7976189851760864, + "reward_std": 0.235702246427536, + "rewards/fixed_code_pass_all_test_reward/mean": 0.797619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.235702246427536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 186.875, + "completions/mean_terminated_length": 186.875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6175982291090205, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.03738026041537523, + "learning_rate": 1.7462893017965145e-05, + "loss": 0.0015, + "num_tokens": 27920581.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 166.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.6177826969193876, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.390625, + "kl": 0.14535973500460386, + "learning_rate": 1.74607494392732e-05, + "loss": 0.0058, + "num_tokens": 27924783.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6179671647297547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.130859375, + "kl": 0.06586991040967405, + "learning_rate": 1.7458605087100364e-05, + "loss": 0.0026, + "num_tokens": 27932991.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 151.125, + "completions/mean_terminated_length": 151.125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.6181516325401217, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.11210122145712376, + "learning_rate": 1.7456459961668956e-05, + "loss": 0.0045, + "num_tokens": 27936984.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 290.25, + "completions/mean_terminated_length": 290.25, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6183361003504888, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8359375, + "kl": 0.06594277825206518, + "learning_rate": 1.745431406320136e-05, + "loss": 0.0026, + "num_tokens": 27948458.0, + "reward": 1.8509615659713745, + "reward_std": 0.3467257022857666, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9759615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.04568212106823921, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.6185205681608559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055908203125, + "kl": 0.043571528512984514, + "learning_rate": 1.7452167391920063e-05, + "loss": 0.0017, + "num_tokens": 27953710.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 269.5, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.6187050359712231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.03385118069127202, + "learning_rate": 1.7450019948047606e-05, + "loss": 0.0014, + "num_tokens": 27964442.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 144.625, + "completions/mean_terminated_length": 144.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.6188895037815901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1279296875, + "kl": 0.0467336589936167, + "learning_rate": 1.744787173180662e-05, + "loss": 0.0019, + "num_tokens": 27968295.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6190739715919572, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.05532377725467086, + "learning_rate": 1.7445722743419826e-05, + "loss": 0.0022, + "num_tokens": 27975180.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 248.75, + "completions/mean_terminated_length": 248.75, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.6192584394023243, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.054185253102332354, + "learning_rate": 1.7443572983110014e-05, + "loss": 0.0022, + "num_tokens": 27981386.0, + "reward": 1.920454502105713, + "reward_std": 0.22498849034309387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9204545617103577, + "rewards/fixed_code_pass_all_test_reward/std": 0.22498852014541626, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 290.375, + "completions/mean_terminated_length": 290.375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.6194429072126914, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.05819185171276331, + "learning_rate": 1.744142245110005e-05, + "loss": 0.0023, + "num_tokens": 27991565.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 242.0, + "completions/mean_terminated_length": 242.0, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6196273750230584, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.05045355367474258, + "learning_rate": 1.7439271147612897e-05, + "loss": 0.002, + "num_tokens": 28000981.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 346.375, + "completions/mean_terminated_length": 346.375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6198118428334256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0693359375, + "kl": 0.05596318026073277, + "learning_rate": 1.743711907287158e-05, + "loss": 0.0022, + "num_tokens": 28011064.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 257.5, + "completions/mean_terminated_length": 257.5, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.6199963106437927, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.06303555611521006, + "learning_rate": 1.743496622709922e-05, + "loss": 0.0025, + "num_tokens": 28017252.0, + "reward": 1.8815789222717285, + "reward_std": 0.33494532108306885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8815789222717285, + "rewards/fixed_code_pass_all_test_reward/std": 0.33494532108306885, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 256.875, + "completions/mean_terminated_length": 256.875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6201807784541598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.05829335446469486, + "learning_rate": 1.7432812610519003e-05, + "loss": 0.0023, + "num_tokens": 28023619.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6203652462645268, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.04785671178251505, + "learning_rate": 1.7430658223354203e-05, + "loss": 0.0019, + "num_tokens": 28031972.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.6205497140748939, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.0487944211345166, + "learning_rate": 1.7428503065828174e-05, + "loss": 0.002, + "num_tokens": 28040976.0, + "reward": 1.9246032238006592, + "reward_std": 0.2132544219493866, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9246031641960144, + "rewards/fixed_code_pass_all_test_reward/std": 0.2132544368505478, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 314.0, + "completions/mean_terminated_length": 314.0, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.620734181885261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.058223762549459934, + "learning_rate": 1.7426347138164346e-05, + "loss": 0.0023, + "num_tokens": 28047712.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 202.875, + "completions/mean_terminated_length": 202.875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.6209186496956282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.028028541710227728, + "learning_rate": 1.742419044058623e-05, + "loss": 0.0011, + "num_tokens": 28053271.0, + "reward": 1.975000023841858, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9750000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106739282608, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 964.625, + "completions/mean_terminated_length": 964.625, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "epoch": 0.6211031175059952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.388671875, + "kl": 0.02563648554496467, + "learning_rate": 1.742203297331743e-05, + "loss": 0.001, + "num_tokens": 28073100.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 325.625, + "completions/mean_terminated_length": 325.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.6212875853163623, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03543960826937109, + "learning_rate": 1.74198747365816e-05, + "loss": 0.0014, + "num_tokens": 28081649.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 292.25, + "completions/mean_terminated_length": 292.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.6214720531267294, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0640694797039032, + "learning_rate": 1.7417715730602504e-05, + "loss": 0.0026, + "num_tokens": 28092163.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 252.0, + "completions/mean_terminated_length": 252.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6216565209370964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.0542169027030468, + "learning_rate": 1.741555595560397e-05, + "loss": 0.0022, + "num_tokens": 28101403.0, + "reward": 1.3125, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 170.75, + "completions/mean_terminated_length": 170.75, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.6218409887474635, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.052955642342567444, + "learning_rate": 1.7413395411809907e-05, + "loss": 0.0021, + "num_tokens": 28107425.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 220.0, + "completions/mean_terminated_length": 220.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.6220254565578307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.048725270200520754, + "learning_rate": 1.741123409944431e-05, + "loss": 0.0019, + "num_tokens": 28114953.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.6222099243681978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.061364800203591585, + "learning_rate": 1.7409072018731245e-05, + "loss": 0.0025, + "num_tokens": 28125601.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 115.375, + "completions/mean_terminated_length": 115.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.6223943921785648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1640625, + "kl": 0.06109807686880231, + "learning_rate": 1.7406909169894866e-05, + "loss": 0.0024, + "num_tokens": 28129180.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 238.75, + "completions/mean_terminated_length": 238.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6225788599889319, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.045600482961162925, + "learning_rate": 1.74047455531594e-05, + "loss": 0.0018, + "num_tokens": 28137506.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.622763327799299, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.05583689140621573, + "learning_rate": 1.7402581168749156e-05, + "loss": 0.0022, + "num_tokens": 28149091.0, + "reward": 1.6749999523162842, + "reward_std": 0.46521881222724915, + "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.46521884202957153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 228.75, + "completions/mean_terminated_length": 228.75, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.6229477956096661, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.1007259413599968, + "learning_rate": 1.7400416016888527e-05, + "loss": 0.004, + "num_tokens": 28156849.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.6231322634200332, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.06840837420895696, + "learning_rate": 1.7398250097801977e-05, + "loss": 0.0027, + "num_tokens": 28165013.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 256.25, + "completions/mean_terminated_length": 256.25, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.6233167312304003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.05639171740040183, + "learning_rate": 1.7396083411714057e-05, + "loss": 0.0023, + "num_tokens": 28172927.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 458.0, + "completions/mean_terminated_length": 458.0, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.6235011990407674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.03930812678299844, + "learning_rate": 1.739391595884939e-05, + "loss": 0.0016, + "num_tokens": 28184119.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 406.5, + "completions/mean_terminated_length": 406.5, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6236856668511345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.02575731312390417, + "learning_rate": 1.7391747739432695e-05, + "loss": 0.001, + "num_tokens": 28192235.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 263.375, + "completions/mean_terminated_length": 263.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.6238701346615015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.060396873159334064, + "learning_rate": 1.7389578753688744e-05, + "loss": 0.0024, + "num_tokens": 28199758.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 180.25, + "completions/mean_terminated_length": 180.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.6240546024718686, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.46875, + "kl": 0.06448884634301066, + "learning_rate": 1.738740900184241e-05, + "loss": 0.0026, + "num_tokens": 28204224.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 289.625, + "completions/mean_terminated_length": 289.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6242390702822358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.82421875, + "kl": 0.1459603374823928, + "learning_rate": 1.738523848411864e-05, + "loss": 0.0058, + "num_tokens": 28213373.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 338.625, + "completions/mean_terminated_length": 338.625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6244235380926029, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.05459106923080981, + "learning_rate": 1.7383067200742454e-05, + "loss": 0.0022, + "num_tokens": 28222818.0, + "reward": 1.6875, + "reward_std": 0.45806270837783813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.45806270837783813, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6246080059029699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.04633820476010442, + "learning_rate": 1.7380895151938962e-05, + "loss": 0.0019, + "num_tokens": 28228475.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 239.5, + "completions/mean_terminated_length": 239.5, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.624792473713337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.05067318584769964, + "learning_rate": 1.7378722337933342e-05, + "loss": 0.002, + "num_tokens": 28237151.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 418.0, + "completions/mean_terminated_length": 418.0, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6249769415237041, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.056313605047762394, + "learning_rate": 1.737654875895086e-05, + "loss": 0.0023, + "num_tokens": 28249191.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 440.125, + "completions/mean_terminated_length": 210.42857360839844, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.6251614093340712, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05960172868799418, + "learning_rate": 1.737437441521686e-05, + "loss": 0.0024, + "num_tokens": 28255888.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 289.375, + "completions/mean_terminated_length": 289.375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.6253458771444383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.060770762618631124, + "learning_rate": 1.7372199306956758e-05, + "loss": 0.0024, + "num_tokens": 28264651.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.6255303449548054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.055230563739314675, + "learning_rate": 1.7370023434396057e-05, + "loss": 0.0022, + "num_tokens": 28273953.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 303.0, + "completions/mean_terminated_length": 303.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6257148127651725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.05630552442744374, + "learning_rate": 1.736784679776034e-05, + "loss": 0.0023, + "num_tokens": 28283985.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 413.5, + "completions/mean_terminated_length": 413.5, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.6258992805755396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.06325561599805951, + "learning_rate": 1.7365669397275265e-05, + "loss": 0.0025, + "num_tokens": 28292469.0, + "reward": 1.9500000476837158, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6260837483859066, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1611328125, + "kl": 0.04284806200303137, + "learning_rate": 1.7363491233166567e-05, + "loss": 0.0017, + "num_tokens": 28296634.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 283.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6262682161962737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.06358213676139712, + "learning_rate": 1.7361312305660064e-05, + "loss": 0.0025, + "num_tokens": 28306146.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 598.75, + "completions/mean_terminated_length": 598.75, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.6264526840066409, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76171875, + "kl": 0.03874512610491365, + "learning_rate": 1.735913261498166e-05, + "loss": 0.0015, + "num_tokens": 28316856.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 232.875, + "completions/mean_terminated_length": 232.875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.626637151817008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.04345501773059368, + "learning_rate": 1.7356952161357322e-05, + "loss": 0.0017, + "num_tokens": 28326335.0, + "reward": 1.2857142686843872, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 318.0, + "completions/mean_terminated_length": 318.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.626821619627375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.05715658771805465, + "learning_rate": 1.7354770945013107e-05, + "loss": 0.0023, + "num_tokens": 28333415.0, + "reward": 1.8571429252624512, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1258.0, + "completions/max_terminated_length": 1258.0, + "completions/mean_length": 917.875, + "completions/mean_terminated_length": 917.875, + "completions/min_length": 801.0, + "completions/min_terminated_length": 801.0, + "epoch": 0.6270060874377421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0286865234375, + "kl": 0.01515455375192687, + "learning_rate": 1.7352588966175156e-05, + "loss": 0.0006, + "num_tokens": 28349566.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 331.25, + "completions/mean_terminated_length": 331.25, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.6271905552481092, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2021484375, + "kl": 0.07053592288866639, + "learning_rate": 1.7350406225069674e-05, + "loss": 0.0028, + "num_tokens": 28360800.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 334.125, + "completions/mean_terminated_length": 334.125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6273750230584763, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06689757923595607, + "learning_rate": 1.7348222721922953e-05, + "loss": 0.0027, + "num_tokens": 28370057.0, + "reward": 1.5, + "reward_std": 0.7559289336204529, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 125.0, + "completions/mean_terminated_length": 125.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6275594908688434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.05433904880192131, + "learning_rate": 1.734603845696137e-05, + "loss": 0.0022, + "num_tokens": 28373737.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 258.375, + "completions/mean_terminated_length": 258.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6277439586792105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.02973150776233524, + "learning_rate": 1.734385343041137e-05, + "loss": 0.0012, + "num_tokens": 28383836.0, + "reward": 1.9488189220428467, + "reward_std": 0.061245452612638474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9488189220428467, + "rewards/fixed_code_pass_all_test_reward/std": 0.061245471239089966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 638.75, + "completions/mean_terminated_length": 638.75, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.6279284264895776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.027418581070378423, + "learning_rate": 1.7341667642499485e-05, + "loss": 0.0011, + "num_tokens": 28398314.0, + "reward": 1.192307710647583, + "reward_std": 0.15924587845802307, + "rewards/fixed_code_pass_all_test_reward/mean": 0.192307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.15924592316150665, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 255.375, + "completions/mean_terminated_length": 255.375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6281128942999447, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.059361332561820745, + "learning_rate": 1.7339481093452325e-05, + "loss": 0.0024, + "num_tokens": 28404325.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 230.875, + "completions/mean_terminated_length": 230.875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.6282973621103117, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.05191280925646424, + "learning_rate": 1.7337293783496567e-05, + "loss": 0.0021, + "num_tokens": 28409844.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 279.625, + "completions/mean_terminated_length": 279.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.6284818299206788, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.05258111469447613, + "learning_rate": 1.7335105712858988e-05, + "loss": 0.0021, + "num_tokens": 28417617.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 277.0, + "completions/mean_terminated_length": 277.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.628666297731046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26953125, + "kl": 0.07505911658518016, + "learning_rate": 1.7332916881766423e-05, + "loss": 0.003, + "num_tokens": 28423889.0, + "reward": 1.1538461446762085, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461595773697, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 388.625, + "completions/mean_terminated_length": 388.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.6288507655414131, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.064131042920053, + "learning_rate": 1.7330727290445806e-05, + "loss": 0.0026, + "num_tokens": 28431214.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6290352333517801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.05376318749040365, + "learning_rate": 1.732853693912413e-05, + "loss": 0.0022, + "num_tokens": 28441424.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 283.625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.6292197011621472, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.0251788446912542, + "learning_rate": 1.7326345828028483e-05, + "loss": 0.001, + "num_tokens": 28451165.0, + "reward": 1.5921052694320679, + "reward_std": 0.43605780601501465, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5921052694320679, + "rewards/fixed_code_pass_all_test_reward/std": 0.43605780601501465, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.6294041689725143, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.037185788969509304, + "learning_rate": 1.732415395738602e-05, + "loss": 0.0015, + "num_tokens": 28456582.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 335.875, + "completions/mean_terminated_length": 335.875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.6295886367828814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.04170013265684247, + "learning_rate": 1.732196132742398e-05, + "loss": 0.0017, + "num_tokens": 28463765.0, + "reward": 1.696428656578064, + "reward_std": 0.1937432587146759, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6964285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.1937432438135147, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 358.125, + "completions/mean_terminated_length": 358.125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6297731045932484, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.0426305690780282, + "learning_rate": 1.7319767938369682e-05, + "loss": 0.0017, + "num_tokens": 28471790.0, + "reward": 1.75, + "reward_std": 0.1478712558746338, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 488.5, + "completions/mean_terminated_length": 488.5, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.6299575724036156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.03625028778333217, + "learning_rate": 1.731757379045052e-05, + "loss": 0.0015, + "num_tokens": 28481322.0, + "reward": 1.9500000476837158, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.6301420402139827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.06179301766678691, + "learning_rate": 1.731537888389397e-05, + "loss": 0.0025, + "num_tokens": 28491613.0, + "reward": 1.451923131942749, + "reward_std": 0.8971284627914429, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7019230723381042, + "rewards/fixed_code_pass_all_test_reward/std": 0.4352640211582184, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 431.75, + "completions/mean_terminated_length": 431.75, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.6303265080243498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.03763705282472074, + "learning_rate": 1.731318321892759e-05, + "loss": 0.0015, + "num_tokens": 28499507.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 365.5, + "completions/mean_terminated_length": 365.5, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.6305109758347168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.04317702213302255, + "learning_rate": 1.7310986795779006e-05, + "loss": 0.0017, + "num_tokens": 28507519.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 400.875, + "completions/mean_terminated_length": 400.875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6306954436450839, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.06232546176761389, + "learning_rate": 1.7308789614675926e-05, + "loss": 0.0025, + "num_tokens": 28518518.0, + "reward": 1.1818182468414307, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1818181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 476.5, + "completions/mean_terminated_length": 476.5, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.630879911455451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.04734708252362907, + "learning_rate": 1.7306591675846145e-05, + "loss": 0.0019, + "num_tokens": 28529506.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 166.5, + "completions/mean_terminated_length": 166.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.6310643792658182, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1904296875, + "kl": 0.07584367087110877, + "learning_rate": 1.730439297951753e-05, + "loss": 0.003, + "num_tokens": 28533718.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 261.375, + "completions/mean_terminated_length": 261.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6312488470761852, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.047650488559156656, + "learning_rate": 1.7302193525918026e-05, + "loss": 0.0019, + "num_tokens": 28540089.0, + "reward": 1.732758641242981, + "reward_std": 0.09353125840425491, + "rewards/fixed_code_pass_all_test_reward/mean": 0.732758641242981, + "rewards/fixed_code_pass_all_test_reward/std": 0.09353122860193253, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6314333148865523, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.04081716435030103, + "learning_rate": 1.7299993315275656e-05, + "loss": 0.0016, + "num_tokens": 28544560.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 324.75, + "completions/mean_terminated_length": 324.75, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.6316177826969194, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.03762569115497172, + "learning_rate": 1.7297792347818525e-05, + "loss": 0.0015, + "num_tokens": 28551054.0, + "reward": 1.5, + "reward_std": 0.3831780254840851, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.3831780254840851, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 259.625, + "completions/mean_terminated_length": 259.625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6318022505072864, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.5, + "kl": 0.13949679117649794, + "learning_rate": 1.7295590623774815e-05, + "loss": 0.0056, + "num_tokens": 28560267.0, + "reward": 1.796875, + "reward_std": 0.3892385959625244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 394.0, + "completions/mean_terminated_length": 394.0, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.6319867183176535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.033099250635132194, + "learning_rate": 1.7293388143372788e-05, + "loss": 0.0013, + "num_tokens": 28570507.0, + "reward": 1.9886363744735718, + "reward_std": 0.03214118629693985, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9886363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.6321711861280207, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.07750570075586438, + "learning_rate": 1.7291184906840776e-05, + "loss": 0.0031, + "num_tokens": 28574895.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 335.25, + "completions/mean_terminated_length": 335.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.6323556539383878, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.07906265323981643, + "learning_rate": 1.7288980914407203e-05, + "loss": 0.0032, + "num_tokens": 28583545.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6325401217487548, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.04161822376772761, + "learning_rate": 1.728677616630056e-05, + "loss": 0.0017, + "num_tokens": 28593432.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 215.25, + "completions/mean_terminated_length": 215.25, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.6327245895591219, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.0615637784358114, + "learning_rate": 1.7284570662749422e-05, + "loss": 0.0025, + "num_tokens": 28598650.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 241.125, + "completions/mean_terminated_length": 241.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.632909057369489, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.056954541709274054, + "learning_rate": 1.728236440398244e-05, + "loss": 0.0023, + "num_tokens": 28607219.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 238.125, + "completions/mean_terminated_length": 238.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.6330935251798561, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9765625, + "kl": 0.07060477905906737, + "learning_rate": 1.728015739022835e-05, + "loss": 0.0028, + "num_tokens": 28613516.0, + "reward": 1.9821429252624512, + "reward_std": 0.033064987510442734, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.03306501731276512, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 190.875, + "completions/mean_terminated_length": 190.875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.6332779929902232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.057190278079360723, + "learning_rate": 1.7277949621715953e-05, + "loss": 0.0023, + "num_tokens": 28617827.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 420.625, + "completions/mean_terminated_length": 420.625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.6334624608005903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03602286125533283, + "learning_rate": 1.7275741098674138e-05, + "loss": 0.0014, + "num_tokens": 28628608.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 295.0, + "completions/mean_terminated_length": 295.0, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.6336469286109574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.06067067617550492, + "learning_rate": 1.727353182133187e-05, + "loss": 0.0024, + "num_tokens": 28635000.0, + "reward": 1.5843373537063599, + "reward_std": 0.44436201453208923, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5843373537063599, + "rewards/fixed_code_pass_all_test_reward/std": 0.444362074136734, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 265.125, + "completions/mean_terminated_length": 265.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6338313964213245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.056075247935950756, + "learning_rate": 1.727132178991819e-05, + "loss": 0.0022, + "num_tokens": 28644289.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 406.625, + "completions/mean_terminated_length": 406.625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.6340158642316915, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.06642380123957992, + "learning_rate": 1.7269111004662226e-05, + "loss": 0.0027, + "num_tokens": 28655998.0, + "reward": 1.3854166269302368, + "reward_std": 0.2517301142215729, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3854166567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.2517301142215729, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 341.875, + "completions/mean_terminated_length": 341.875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.6342003320420586, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.07010610355064273, + "learning_rate": 1.7266899465793168e-05, + "loss": 0.0028, + "num_tokens": 28665933.0, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 321.875, + "completions/mean_terminated_length": 321.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.6343847998524258, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03258367581292987, + "learning_rate": 1.7264687173540305e-05, + "loss": 0.0013, + "num_tokens": 28678188.0, + "reward": 1.4789916276931763, + "reward_std": 0.0950731709599495, + "rewards/fixed_code_pass_all_test_reward/mean": 0.47899162769317627, + "rewards/fixed_code_pass_all_test_reward/std": 0.09507319331169128, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 289.25, + "completions/mean_terminated_length": 289.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6345692676627929, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.07312341034412384, + "learning_rate": 1.7262474128132983e-05, + "loss": 0.0029, + "num_tokens": 28685942.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 421.625, + "completions/mean_terminated_length": 421.625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6347537354731599, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.04071953718084842, + "learning_rate": 1.7260260329800642e-05, + "loss": 0.0016, + "num_tokens": 28694187.0, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 244.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.634938203283527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.03248983109369874, + "learning_rate": 1.725804577877279e-05, + "loss": 0.0013, + "num_tokens": 28699045.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 173.75, + "completions/mean_terminated_length": 173.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.6351226710938941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.06660315161570907, + "learning_rate": 1.725583047527902e-05, + "loss": 0.0027, + "num_tokens": 28703411.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 499.625, + "completions/mean_terminated_length": 499.625, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.6353071389042612, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109375, + "kl": 0.019686337211169302, + "learning_rate": 1.7253614419548997e-05, + "loss": 0.0008, + "num_tokens": 28713024.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 355.0, + "completions/mean_terminated_length": 355.0, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6354916067146283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.023041862528771162, + "learning_rate": 1.725139761181247e-05, + "loss": 0.0009, + "num_tokens": 28723440.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 391.375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6356760745249954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.228515625, + "kl": 0.0500498884357512, + "learning_rate": 1.7249180052299263e-05, + "loss": 0.002, + "num_tokens": 28731531.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 169.25, + "completions/mean_terminated_length": 169.25, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.6358605423353625, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.09902601689100266, + "learning_rate": 1.7246961741239273e-05, + "loss": 0.004, + "num_tokens": 28735701.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 417.0, + "completions/mean_terminated_length": 417.0, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.6360450101457296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.034463790827430785, + "learning_rate": 1.7244742678862483e-05, + "loss": 0.0014, + "num_tokens": 28744029.0, + "reward": 1.0110294818878174, + "reward_std": 0.03119589202105999, + "rewards/fixed_code_pass_all_test_reward/mean": 0.011029412038624287, + "rewards/fixed_code_pass_all_test_reward/std": 0.03119588829576969, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 238.375, + "completions/mean_terminated_length": 238.375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.6362294779560966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.059244331205263734, + "learning_rate": 1.724252286539895e-05, + "loss": 0.0024, + "num_tokens": 28749760.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 474.375, + "completions/mean_terminated_length": 474.375, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.6364139457664637, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.06603417498990893, + "learning_rate": 1.724030230107881e-05, + "loss": 0.0026, + "num_tokens": 28758699.0, + "reward": 1.461538553237915, + "reward_std": 0.39007121324539185, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4615384638309479, + "rewards/fixed_code_pass_all_test_reward/std": 0.39007121324539185, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 228.625, + "completions/mean_terminated_length": 228.625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.6365984135768309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.040396714583039284, + "learning_rate": 1.723808098613228e-05, + "loss": 0.0016, + "num_tokens": 28766768.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 520.5, + "completions/mean_terminated_length": 520.5, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.636782881387198, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.025033711805008352, + "learning_rate": 1.723585892078964e-05, + "loss": 0.001, + "num_tokens": 28776652.0, + "reward": 1.2916667461395264, + "reward_std": 0.33034372329711914, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.33034375309944153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 212.875, + "completions/mean_terminated_length": 212.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.636967349197565, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.09221207536756992, + "learning_rate": 1.723363610528127e-05, + "loss": 0.0037, + "num_tokens": 28781187.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 313.875, + "completions/mean_terminated_length": 313.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6371518170079321, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.04959116643294692, + "learning_rate": 1.7231412539837616e-05, + "loss": 0.002, + "num_tokens": 28790194.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.6373362848182992, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6640625, + "kl": 0.09752961248159409, + "learning_rate": 1.7229188224689196e-05, + "loss": 0.0039, + "num_tokens": 28797746.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 307.625, + "completions/mean_terminated_length": 307.625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.6375207526286663, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.05241863592527807, + "learning_rate": 1.722696316006662e-05, + "loss": 0.0021, + "num_tokens": 28804639.0, + "reward": 1.892241358757019, + "reward_std": 0.11575041711330414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.892241358757019, + "rewards/fixed_code_pass_all_test_reward/std": 0.11575044691562653, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 267.75, + "completions/mean_terminated_length": 267.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.6377052204390334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.04682483244687319, + "learning_rate": 1.722473734620056e-05, + "loss": 0.0019, + "num_tokens": 28813061.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 468.375, + "completions/mean_terminated_length": 468.375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.6378896882494005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.029011288657784462, + "learning_rate": 1.722251078332178e-05, + "loss": 0.0012, + "num_tokens": 28822136.0, + "reward": 1.6607142686843872, + "reward_std": 0.682720959186554, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3967800438404083, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 362.625, + "completions/mean_terminated_length": 362.625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.6380741560597676, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0540568835567683, + "learning_rate": 1.722028347166111e-05, + "loss": 0.0022, + "num_tokens": 28832957.0, + "reward": 1.014423131942749, + "reward_std": 0.04079460725188255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.014423076994717121, + "rewards/fixed_code_pass_all_test_reward/std": 0.04079462215304375, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 255.625, + "completions/mean_terminated_length": 255.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6382586238701347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.04164074675645679, + "learning_rate": 1.7218055411449467e-05, + "loss": 0.0017, + "num_tokens": 28838874.0, + "reward": 1.487069010734558, + "reward_std": 0.25167182087898254, + "rewards/fixed_code_pass_all_test_reward/mean": 0.48706895112991333, + "rewards/fixed_code_pass_all_test_reward/std": 0.25167182087898254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 804.375, + "completions/mean_terminated_length": 804.375, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "epoch": 0.6384430916805017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109375, + "kl": 0.036046633729711175, + "learning_rate": 1.7215826602917838e-05, + "loss": 0.0014, + "num_tokens": 28855845.0, + "reward": 1.8125, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 368.75, + "completions/mean_terminated_length": 368.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.6386275594908688, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1474609375, + "kl": 0.06934052752330899, + "learning_rate": 1.7213597046297295e-05, + "loss": 0.0028, + "num_tokens": 28865147.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 377.25, + "completions/mean_terminated_length": 377.25, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.638812027301236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.05190222733654082, + "learning_rate": 1.7211366741818976e-05, + "loss": 0.0021, + "num_tokens": 28872589.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 230.25, + "completions/mean_terminated_length": 230.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6389964951116031, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.02695427555590868, + "learning_rate": 1.7209135689714114e-05, + "loss": 0.0011, + "num_tokens": 28878271.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 368.375, + "completions/mean_terminated_length": 368.375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.6391809629219701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.03707165876403451, + "learning_rate": 1.7206903890214007e-05, + "loss": 0.0015, + "num_tokens": 28885210.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 148.375, + "completions/mean_terminated_length": 148.375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6393654307323372, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.765625, + "kl": 0.07017428148537874, + "learning_rate": 1.720467134355003e-05, + "loss": 0.0028, + "num_tokens": 28889133.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.6395498985427043, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.03562236996367574, + "learning_rate": 1.720243804995364e-05, + "loss": 0.0014, + "num_tokens": 28893165.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.6397343663530713, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.031986344954930246, + "learning_rate": 1.7200204009656368e-05, + "loss": 0.0013, + "num_tokens": 28901670.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 372.75, + "completions/mean_terminated_length": 372.75, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.6399188341634385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.04303280997555703, + "learning_rate": 1.7197969222889827e-05, + "loss": 0.0017, + "num_tokens": 28911052.0, + "reward": 1.3830645084381104, + "reward_std": 0.5110456943511963, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5080645084381104, + "rewards/fixed_code_pass_all_test_reward/std": 0.5260425209999084, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 270.5, + "completions/mean_terminated_length": 270.5, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6401033019738056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.039954389445483685, + "learning_rate": 1.719573368988571e-05, + "loss": 0.0016, + "num_tokens": 28918336.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 229.5, + "completions/mean_terminated_length": 229.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.6402877697841727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.06670100055634975, + "learning_rate": 1.7193497410875766e-05, + "loss": 0.0027, + "num_tokens": 28923132.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 425.75, + "completions/mean_terminated_length": 425.75, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.6404722375945398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05126953125, + "kl": 0.03627040924038738, + "learning_rate": 1.7191260386091857e-05, + "loss": 0.0015, + "num_tokens": 28931386.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.6406567054049068, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.34765625, + "kl": 0.044685350148938596, + "learning_rate": 1.718902261576589e-05, + "loss": 0.0018, + "num_tokens": 28935457.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 461.75, + "completions/mean_terminated_length": 461.75, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.6408411732152739, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.03351482772268355, + "learning_rate": 1.7186784100129864e-05, + "loss": 0.0013, + "num_tokens": 28944111.0, + "reward": 1.7946429252624512, + "reward_std": 0.3920446038246155, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7946428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3920446038246155, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 352.125, + "completions/mean_terminated_length": 352.125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.6410256410256411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.03916339739225805, + "learning_rate": 1.718454483941586e-05, + "loss": 0.0016, + "num_tokens": 28953096.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 317.5, + "completions/mean_terminated_length": 317.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.6412101088360082, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.0762798935174942, + "learning_rate": 1.7182304833856026e-05, + "loss": 0.0031, + "num_tokens": 28959700.0, + "reward": 1.771505355834961, + "reward_std": 0.4230898320674896, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7715053558349609, + "rewards/fixed_code_pass_all_test_reward/std": 0.4230898320674896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 132.875, + "completions/mean_terminated_length": 132.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6413945766463752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16015625, + "kl": 0.06368000456131995, + "learning_rate": 1.7180064083682583e-05, + "loss": 0.0025, + "num_tokens": 28963547.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 358.125, + "completions/mean_terminated_length": 358.125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.6415790444567423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.029705253487918526, + "learning_rate": 1.717782258912785e-05, + "loss": 0.0012, + "num_tokens": 28974716.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 545.5, + "completions/mean_terminated_length": 330.8571472167969, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.6417635122671094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.14663005527108908, + "learning_rate": 1.7175580350424202e-05, + "loss": 0.0059, + "num_tokens": 28981944.0, + "reward": 0.875, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 385.625, + "completions/mean_terminated_length": 385.625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.6419479800774764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.013653967762365937, + "learning_rate": 1.7173337367804103e-05, + "loss": 0.0005, + "num_tokens": 28988805.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 341.125, + "completions/mean_terminated_length": 341.125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.6421324478878435, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.046565545722842216, + "learning_rate": 1.7171093641500087e-05, + "loss": 0.0019, + "num_tokens": 28998150.0, + "reward": 1.25, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 470.375, + "completions/mean_terminated_length": 470.375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.6423169156982107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.04831159207969904, + "learning_rate": 1.716884917174477e-05, + "loss": 0.0019, + "num_tokens": 29009161.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 309.75, + "completions/mean_terminated_length": 309.75, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.6425013835085778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.05346830980852246, + "learning_rate": 1.7166603958770848e-05, + "loss": 0.0021, + "num_tokens": 29018471.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 253.75, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.6426858513189448, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.06737195048481226, + "learning_rate": 1.716435800281109e-05, + "loss": 0.0027, + "num_tokens": 29023989.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 209.5, + "completions/mean_terminated_length": 209.5, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6428703191293119, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.06831734557636082, + "learning_rate": 1.716211130409833e-05, + "loss": 0.0027, + "num_tokens": 29028441.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 160.25, + "completions/mean_terminated_length": 160.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.643054786939679, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9375, + "kl": 0.13315195124596357, + "learning_rate": 1.71598638628655e-05, + "loss": 0.0053, + "num_tokens": 29032595.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 342.75, + "completions/mean_terminated_length": 342.75, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.6432392547500461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.021956480981316417, + "learning_rate": 1.7157615679345602e-05, + "loss": 0.0009, + "num_tokens": 29039217.0, + "reward": 1.7374999523162842, + "reward_std": 0.25599944591522217, + "rewards/fixed_code_pass_all_test_reward/mean": 0.737500011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.25599944591522217, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6434237225604132, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.765625, + "kl": 0.2908115000464022, + "learning_rate": 1.7155366753771708e-05, + "loss": 0.0116, + "num_tokens": 29047842.0, + "reward": 1.125, + "reward_std": 0.05050764977931976, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 172.375, + "completions/mean_terminated_length": 172.375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6436081903707803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.07113793632015586, + "learning_rate": 1.7153117086376974e-05, + "loss": 0.0028, + "num_tokens": 29052141.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 326.25, + "completions/mean_terminated_length": 326.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.6437926581811474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.047252022195607424, + "learning_rate": 1.715086667739463e-05, + "loss": 0.0019, + "num_tokens": 29061671.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 429.375, + "completions/mean_terminated_length": 429.375, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.6439771259915145, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.04195108765270561, + "learning_rate": 1.714861552705798e-05, + "loss": 0.0017, + "num_tokens": 29072754.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 601.875, + "completions/mean_terminated_length": 601.875, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.6441615938018815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.028670434840023518, + "learning_rate": 1.7146363635600413e-05, + "loss": 0.0011, + "num_tokens": 29083465.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 247.25, + "completions/mean_terminated_length": 247.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.6443460616122486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.04713781480677426, + "learning_rate": 1.7144111003255386e-05, + "loss": 0.0019, + "num_tokens": 29088323.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 526.5, + "completions/mean_terminated_length": 526.5, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.6445305294226158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.06541133904829621, + "learning_rate": 1.7141857630256442e-05, + "loss": 0.0026, + "num_tokens": 29099399.0, + "reward": 1.658046007156372, + "reward_std": 0.25195762515068054, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6580460071563721, + "rewards/fixed_code_pass_all_test_reward/std": 0.25195759534835815, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 156.0, + "completions/mean_terminated_length": 156.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.6447149972329829, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.08131601242348552, + "learning_rate": 1.7139603516837193e-05, + "loss": 0.0033, + "num_tokens": 29103439.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 307.625, + "completions/mean_terminated_length": 307.625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.6448994650433499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033935546875, + "kl": 0.014463061816059053, + "learning_rate": 1.713734866323133e-05, + "loss": 0.0006, + "num_tokens": 29110348.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 369.5, + "completions/mean_terminated_length": 369.5, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.645083932853717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.045549745904281735, + "learning_rate": 1.7135093069672623e-05, + "loss": 0.0018, + "num_tokens": 29120064.0, + "reward": 1.8081896305084229, + "reward_std": 0.23103035986423492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8081896305084229, + "rewards/fixed_code_pass_all_test_reward/std": 0.23103035986423492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 550.75, + "completions/mean_terminated_length": 550.75, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.6452684006640841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.034454753156751394, + "learning_rate": 1.713283673639491e-05, + "loss": 0.0014, + "num_tokens": 29129782.0, + "reward": 1.4722222089767456, + "reward_std": 0.051434475928545, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4722222089767456, + "rewards/fixed_code_pass_all_test_reward/std": 0.051434461027383804, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 710.25, + "completions/mean_terminated_length": 710.25, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "epoch": 0.6454528684744512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6796875, + "kl": 0.03268989152275026, + "learning_rate": 1.7130579663632124e-05, + "loss": 0.0013, + "num_tokens": 29145472.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 246.25, + "completions/mean_terminated_length": 246.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.6456373362848183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.026255927921738476, + "learning_rate": 1.7128321851618256e-05, + "loss": 0.0011, + "num_tokens": 29150602.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 136.0, + "completions/mean_terminated_length": 136.0, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.6458218040951854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.75, + "kl": 0.10595285682938993, + "learning_rate": 1.7126063300587382e-05, + "loss": 0.0042, + "num_tokens": 29154490.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 393.75, + "completions/mean_terminated_length": 393.75, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.6460062719055525, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.027833778643980622, + "learning_rate": 1.7123804010773657e-05, + "loss": 0.0011, + "num_tokens": 29162976.0, + "reward": 1.9249999523162842, + "reward_std": 0.2121320217847824, + "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6461907397159196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.042864460963755846, + "learning_rate": 1.7121543982411302e-05, + "loss": 0.0017, + "num_tokens": 29167594.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.6463752075262866, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.03370582510251552, + "learning_rate": 1.711928321573463e-05, + "loss": 0.0013, + "num_tokens": 29171972.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 347.125, + "completions/mean_terminated_length": 347.125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.6465596753366537, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.05031390301883221, + "learning_rate": 1.7117021710978016e-05, + "loss": 0.002, + "num_tokens": 29178613.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 245.125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.6467441431470209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9609375, + "kl": 0.13079465553164482, + "learning_rate": 1.711475946837592e-05, + "loss": 0.0052, + "num_tokens": 29185806.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 314.25, + "completions/mean_terminated_length": 314.25, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.646928610957388, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.04557510884478688, + "learning_rate": 1.711249648816288e-05, + "loss": 0.0018, + "num_tokens": 29193920.0, + "reward": 1.625, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 412.625, + "completions/mean_terminated_length": 412.625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.647113078767755, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.0454241493716836, + "learning_rate": 1.7110232770573497e-05, + "loss": 0.0018, + "num_tokens": 29205317.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6472975465781221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.061412982642650604, + "learning_rate": 1.7107968315842467e-05, + "loss": 0.0025, + "num_tokens": 29211798.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 315.875, + "completions/mean_terminated_length": 315.875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6474820143884892, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.059985117986798286, + "learning_rate": 1.710570312420455e-05, + "loss": 0.0024, + "num_tokens": 29219981.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 259.875, + "completions/mean_terminated_length": 259.875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6476664821988563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.04265649407170713, + "learning_rate": 1.7103437195894588e-05, + "loss": 0.0017, + "num_tokens": 29225572.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 298.125, + "completions/mean_terminated_length": 298.125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.6478509500092234, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.05274852621369064, + "learning_rate": 1.7101170531147496e-05, + "loss": 0.0021, + "num_tokens": 29232037.0, + "reward": 1.0500000715255737, + "reward_std": 0.4375254809856415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.12817399203777313, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 391.0, + "completions/mean_terminated_length": 391.0, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.6480354178195905, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.04025741631630808, + "learning_rate": 1.709890313019827e-05, + "loss": 0.0016, + "num_tokens": 29242501.0, + "reward": 1.7000000476837158, + "reward_std": 0.32071352005004883, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.32071349024772644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 512.25, + "completions/mean_terminated_length": 512.25, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.6482198856299576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.044669343158602715, + "learning_rate": 1.7096634993281972e-05, + "loss": 0.0018, + "num_tokens": 29252159.0, + "reward": 1.3958333730697632, + "reward_std": 0.12400396168231964, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3958333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.12400396913290024, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 204.0, + "completions/mean_terminated_length": 204.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.6484043534403247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35546875, + "kl": 0.059085462940856814, + "learning_rate": 1.709436612063375e-05, + "loss": 0.0024, + "num_tokens": 29256631.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 305.375, + "completions/mean_terminated_length": 305.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6485888212506917, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.05148002109490335, + "learning_rate": 1.709209651248883e-05, + "loss": 0.0021, + "num_tokens": 29266018.0, + "reward": 1.5478723049163818, + "reward_std": 0.7314721345901489, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6728723049163818, + "rewards/fixed_code_pass_all_test_reward/std": 0.46669283509254456, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 227.125, + "completions/mean_terminated_length": 227.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.6487732890610588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.031050573103129864, + "learning_rate": 1.7089826169082506e-05, + "loss": 0.0012, + "num_tokens": 29270731.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 512.625, + "completions/mean_terminated_length": 512.625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.648957756871426, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.05682907020673156, + "learning_rate": 1.7087555090650153e-05, + "loss": 0.0023, + "num_tokens": 29279624.0, + "reward": 1.6800000667572021, + "reward_std": 0.2892354428768158, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6800000071525574, + "rewards/fixed_code_pass_all_test_reward/std": 0.2892354726791382, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 392.25, + "completions/mean_terminated_length": 392.25, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.6491422246817931, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.03782501083333045, + "learning_rate": 1.708528327742722e-05, + "loss": 0.0015, + "num_tokens": 29289570.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.6493266924921601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.04337457357905805, + "learning_rate": 1.7083010729649237e-05, + "loss": 0.0017, + "num_tokens": 29298080.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 212.375, + "completions/mean_terminated_length": 212.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.6495111603025272, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.03730009822174907, + "learning_rate": 1.7080737447551804e-05, + "loss": 0.0015, + "num_tokens": 29302803.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 490.5, + "completions/mean_terminated_length": 490.5, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6496956281128943, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.0712610399350524, + "learning_rate": 1.7078463431370598e-05, + "loss": 0.0029, + "num_tokens": 29313143.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 440.75, + "completions/mean_terminated_length": 440.75, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.6498800959232613, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.044433149974793196, + "learning_rate": 1.7076188681341378e-05, + "loss": 0.0018, + "num_tokens": 29324869.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 278.25, + "completions/mean_terminated_length": 278.25, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.6500645637336285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.03620890202000737, + "learning_rate": 1.707391319769997e-05, + "loss": 0.0014, + "num_tokens": 29331767.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 203.875, + "completions/mean_terminated_length": 203.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6502490315439956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.02555637143086642, + "learning_rate": 1.7071636980682288e-05, + "loss": 0.001, + "num_tokens": 29336134.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 329.0, + "completions/mean_terminated_length": 329.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6504334993543627, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.0402876086300239, + "learning_rate": 1.706936003052431e-05, + "loss": 0.0016, + "num_tokens": 29345094.0, + "reward": 1.0083333253860474, + "reward_std": 0.0235702283680439, + "rewards/fixed_code_pass_all_test_reward/mean": 0.008333333767950535, + "rewards/fixed_code_pass_all_test_reward/std": 0.0235702283680439, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.6506179671647297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.044863255927339196, + "learning_rate": 1.7067082347462095e-05, + "loss": 0.0018, + "num_tokens": 29353947.0, + "reward": 1.567307710647583, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 653.125, + "completions/mean_terminated_length": 653.125, + "completions/min_length": 536.0, + "completions/min_terminated_length": 536.0, + "epoch": 0.6508024349750968, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.027257801266387105, + "learning_rate": 1.7064803931731778e-05, + "loss": 0.0011, + "num_tokens": 29369188.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 461.25, + "completions/mean_terminated_length": 461.25, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.6509869027854639, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.05260556470602751, + "learning_rate": 1.7062524783569573e-05, + "loss": 0.0021, + "num_tokens": 29381398.0, + "reward": 1.5, + "reward_std": 0.5629958510398865, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 320.375, + "completions/mean_terminated_length": 320.375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6511713705958311, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.055208828300237656, + "learning_rate": 1.7060244903211764e-05, + "loss": 0.0022, + "num_tokens": 29388033.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 389.125, + "completions/mean_terminated_length": 389.125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6513558384061982, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.1010616400744766, + "learning_rate": 1.705796429089472e-05, + "loss": 0.004, + "num_tokens": 29395634.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.6515403062165652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08642578125, + "kl": 0.05227800470311195, + "learning_rate": 1.705568294685487e-05, + "loss": 0.0021, + "num_tokens": 29400273.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 458.875, + "completions/mean_terminated_length": 458.875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6517247740269323, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.02697537071071565, + "learning_rate": 1.705340087132873e-05, + "loss": 0.0011, + "num_tokens": 29408688.0, + "reward": 1.7272727489471436, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1010.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 646.375, + "completions/mean_terminated_length": 646.375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.6519092418372994, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1142578125, + "kl": 0.043093855725601315, + "learning_rate": 1.70511180645529e-05, + "loss": 0.0017, + "num_tokens": 29420851.0, + "reward": 1.8181817531585693, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8181818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 277.0, + "completions/mean_terminated_length": 277.0, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6520937096476664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.01608935120748356, + "learning_rate": 1.7048834526764035e-05, + "loss": 0.0006, + "num_tokens": 29426723.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 344.5, + "completions/mean_terminated_length": 344.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.6522781774580336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.05386209720745683, + "learning_rate": 1.7046550258198885e-05, + "loss": 0.0022, + "num_tokens": 29437063.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 533.75, + "completions/mean_terminated_length": 533.75, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.6524626452684007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.04106337716802955, + "learning_rate": 1.7044265259094263e-05, + "loss": 0.0016, + "num_tokens": 29448269.0, + "reward": 1.29296875, + "reward_std": 0.452219694852829, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29296875, + "rewards/fixed_code_pass_all_test_reward/std": 0.452219694852829, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 334.25, + "completions/mean_terminated_length": 334.25, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6526471130787678, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.028331871260888875, + "learning_rate": 1.7041979529687063e-05, + "loss": 0.0011, + "num_tokens": 29457167.0, + "reward": 1.851190447807312, + "reward_std": 0.12322601675987244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.851190447807312, + "rewards/fixed_code_pass_all_test_reward/std": 0.12322598695755005, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 780.125, + "completions/mean_terminated_length": 780.125, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "epoch": 0.6528315808891348, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.55859375, + "kl": 0.02169166275416501, + "learning_rate": 1.7039693070214257e-05, + "loss": 0.0009, + "num_tokens": 29478680.0, + "reward": 1.8709677457809448, + "reward_std": 0.35210511088371277, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8709677457809448, + "rewards/fixed_code_pass_all_test_reward/std": 0.35210511088371277, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 172.5, + "completions/mean_terminated_length": 172.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.6530160486995019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.040681323735043406, + "learning_rate": 1.7037405880912887e-05, + "loss": 0.0016, + "num_tokens": 29482788.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 474.625, + "completions/mean_terminated_length": 474.625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.653200516509869, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.03449256962630898, + "learning_rate": 1.7035117962020074e-05, + "loss": 0.0014, + "num_tokens": 29493545.0, + "reward": 1.9621212482452393, + "reward_std": 0.10713736712932587, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9621212482452393, + "rewards/fixed_code_pass_all_test_reward/std": 0.10713739693164825, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.6533849843202362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.044530095998197794, + "learning_rate": 1.7032829313773018e-05, + "loss": 0.0018, + "num_tokens": 29500202.0, + "reward": 1.389423131942749, + "reward_std": 0.15982544422149658, + "rewards/fixed_code_pass_all_test_reward/mean": 0.38942307233810425, + "rewards/fixed_code_pass_all_test_reward/std": 0.1598254293203354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 403.75, + "completions/mean_terminated_length": 403.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.6535694521306032, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2890625, + "kl": 0.08502795756794512, + "learning_rate": 1.7030539936408986e-05, + "loss": 0.0034, + "num_tokens": 29509808.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 595.0, + "completions/mean_terminated_length": 387.4285888671875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6537539199409703, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.498046875, + "kl": 0.023293629987165332, + "learning_rate": 1.7028249830165328e-05, + "loss": 0.0009, + "num_tokens": 29517872.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 494.25, + "completions/mean_terminated_length": 494.25, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.6539383877513374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28125, + "kl": 0.07382483687251806, + "learning_rate": 1.7025958995279466e-05, + "loss": 0.003, + "num_tokens": 29530434.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 535.875, + "completions/mean_terminated_length": 535.875, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.6541228555617045, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0224609375, + "kl": 0.009964551223674789, + "learning_rate": 1.70236674319889e-05, + "loss": 0.0004, + "num_tokens": 29539689.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 305.875, + "completions/mean_terminated_length": 305.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.6543073233720715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.0392616440076381, + "learning_rate": 1.7021375140531203e-05, + "loss": 0.0016, + "num_tokens": 29544960.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 135.0, + "completions/mean_terminated_length": 135.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.6544917911824386, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.08010514662601054, + "learning_rate": 1.7019082121144023e-05, + "loss": 0.0032, + "num_tokens": 29548888.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 421.25, + "completions/mean_terminated_length": 421.25, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.6546762589928058, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.04015236138366163, + "learning_rate": 1.7016788374065084e-05, + "loss": 0.0016, + "num_tokens": 29560538.0, + "reward": 1.9861111640930176, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 147.75, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.6548607268031729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.03754666610620916, + "learning_rate": 1.701449389953219e-05, + "loss": 0.0015, + "num_tokens": 29564600.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 451.5, + "completions/mean_terminated_length": 451.5, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.6550451946135399, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.04572094860486686, + "learning_rate": 1.701219869778322e-05, + "loss": 0.0018, + "num_tokens": 29572308.0, + "reward": 1.0367647409439087, + "reward_std": 0.3572911322116852, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1617647111415863, + "rewards/fixed_code_pass_all_test_reward/std": 0.08281682431697845, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 281.625, + "completions/mean_terminated_length": 281.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.655229662423907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10400390625, + "kl": 0.050917466869577765, + "learning_rate": 1.7009902769056117e-05, + "loss": 0.002, + "num_tokens": 29578393.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.6554141302342741, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.04505270323716104, + "learning_rate": 1.700760611358891e-05, + "loss": 0.0018, + "num_tokens": 29583152.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 308.375, + "completions/mean_terminated_length": 308.375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.6555985980446412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.031955579994246364, + "learning_rate": 1.7005308731619707e-05, + "loss": 0.0013, + "num_tokens": 29588707.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 426.5, + "completions/mean_terminated_length": 426.5, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.6557830658550083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76953125, + "kl": 0.03818586328998208, + "learning_rate": 1.7003010623386678e-05, + "loss": 0.0015, + "num_tokens": 29599247.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 314.875, + "completions/mean_terminated_length": 314.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6559675336653754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038330078125, + "kl": 0.029397392878308892, + "learning_rate": 1.7000711789128082e-05, + "loss": 0.0012, + "num_tokens": 29609102.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 363.5, + "completions/mean_terminated_length": 363.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.6561520014757425, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.048595622880384326, + "learning_rate": 1.699841222908224e-05, + "loss": 0.0019, + "num_tokens": 29618058.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 611.875, + "completions/mean_terminated_length": 611.875, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.6563364692861096, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.030419808928854764, + "learning_rate": 1.6996111943487555e-05, + "loss": 0.0012, + "num_tokens": 29633521.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 355.875, + "completions/mean_terminated_length": 355.875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6565209370964766, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.055334446020424366, + "learning_rate": 1.6993810932582513e-05, + "loss": 0.0022, + "num_tokens": 29643472.0, + "reward": 1.788690447807312, + "reward_std": 0.308787077665329, + "rewards/fixed_code_pass_all_test_reward/mean": 0.788690447807312, + "rewards/fixed_code_pass_all_test_reward/std": 0.308787077665329, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 650.0, + "completions/mean_terminated_length": 650.0, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.6567054049068437, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.04806937696412206, + "learning_rate": 1.6991509196605664e-05, + "loss": 0.0019, + "num_tokens": 29657144.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 287.875, + "completions/mean_terminated_length": 287.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.6568898727172109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.05940876272507012, + "learning_rate": 1.6989206735795634e-05, + "loss": 0.0024, + "num_tokens": 29666199.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 552.25, + "completions/mean_terminated_length": 552.25, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.657074340527578, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.040676050120964646, + "learning_rate": 1.698690355039113e-05, + "loss": 0.0016, + "num_tokens": 29682265.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 478.25, + "completions/mean_terminated_length": 478.25, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.657258808337945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.04319406393915415, + "learning_rate": 1.6984599640630927e-05, + "loss": 0.0017, + "num_tokens": 29691235.0, + "reward": 1.9812500476837158, + "reward_std": 0.0530330166220665, + "rewards/fixed_code_pass_all_test_reward/mean": 0.981249988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.053033001720905304, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 403.25, + "completions/mean_terminated_length": 403.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.6574432761483121, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.014921379159204662, + "learning_rate": 1.6982295006753883e-05, + "loss": 0.0006, + "num_tokens": 29698837.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6576277439586792, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.07079661986790597, + "learning_rate": 1.6979989648998928e-05, + "loss": 0.0028, + "num_tokens": 29706971.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 389.5, + "completions/mean_terminated_length": 389.5, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.6578122117690463, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.03142881614621729, + "learning_rate": 1.697768356760506e-05, + "loss": 0.0013, + "num_tokens": 29714487.0, + "reward": 1.3318965435028076, + "reward_std": 0.0853404551744461, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3318965435028076, + "rewards/fixed_code_pass_all_test_reward/std": 0.0853404700756073, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 380.125, + "completions/mean_terminated_length": 380.125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6579966795794134, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.05338608706369996, + "learning_rate": 1.6975376762811365e-05, + "loss": 0.0021, + "num_tokens": 29726440.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 487.0, + "completions/mean_terminated_length": 487.0, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.6581811473897805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.03242480428889394, + "learning_rate": 1.6973069234856995e-05, + "loss": 0.0013, + "num_tokens": 29738104.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 362.625, + "completions/mean_terminated_length": 362.625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6583656152001476, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.04973982577212155, + "learning_rate": 1.6970760983981174e-05, + "loss": 0.002, + "num_tokens": 29745925.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.6585500830105147, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.234375, + "kl": 0.04524690145626664, + "learning_rate": 1.6968452010423212e-05, + "loss": 0.0018, + "num_tokens": 29750124.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 441.625, + "completions/mean_terminated_length": 441.625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.6587345508208817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.029386924114078283, + "learning_rate": 1.6966142314422487e-05, + "loss": 0.0012, + "num_tokens": 29761337.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.6589190186312488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.041866542655043304, + "learning_rate": 1.6963831896218453e-05, + "loss": 0.0017, + "num_tokens": 29765498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 526.375, + "completions/mean_terminated_length": 526.375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.659103486441616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.02973798383027315, + "learning_rate": 1.6961520756050643e-05, + "loss": 0.0012, + "num_tokens": 29774941.0, + "reward": 1.5, + "reward_std": 0.076360322535038, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.07636036723852158, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 349.75, + "completions/mean_terminated_length": 349.75, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.659287954251983, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.050248213578015566, + "learning_rate": 1.695920889415865e-05, + "loss": 0.002, + "num_tokens": 29783883.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 366.0, + "completions/mean_terminated_length": 366.0, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6594724220623501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.04503906494937837, + "learning_rate": 1.6956896310782158e-05, + "loss": 0.0018, + "num_tokens": 29792155.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 263.875, + "completions/mean_terminated_length": 263.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.6596568898727172, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.04810227151028812, + "learning_rate": 1.6954583006160923e-05, + "loss": 0.0019, + "num_tokens": 29797442.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 399.625, + "completions/mean_terminated_length": 399.625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.6598413576830843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.03518701670691371, + "learning_rate": 1.695226898053477e-05, + "loss": 0.0014, + "num_tokens": 29808303.0, + "reward": 1.633802890777588, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 363.0, + "completions/mean_terminated_length": 363.0, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.6600258254934513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.043448704993352294, + "learning_rate": 1.6949954234143603e-05, + "loss": 0.0017, + "num_tokens": 29817071.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 397.25, + "completions/mean_terminated_length": 397.25, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6602102933038185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.07032757764682174, + "learning_rate": 1.6947638767227398e-05, + "loss": 0.0028, + "num_tokens": 29827089.0, + "reward": 1.5749999284744263, + "reward_std": 0.6363961100578308, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 398.5, + "completions/mean_terminated_length": 398.5, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.6603947611141856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.040227144258096814, + "learning_rate": 1.694532258002621e-05, + "loss": 0.0016, + "num_tokens": 29838677.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 248.5, + "completions/mean_terminated_length": 248.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6605792289245527, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.04096542880870402, + "learning_rate": 1.6943005672780164e-05, + "loss": 0.0016, + "num_tokens": 29843481.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 772.625, + "completions/mean_terminated_length": 772.625, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.6607636967349197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6640625, + "kl": 0.03150696854572743, + "learning_rate": 1.6940688045729458e-05, + "loss": 0.0013, + "num_tokens": 29862262.0, + "reward": 1.5714285373687744, + "reward_std": 0.4948716461658478, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.49487167596817017, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 560.0, + "completions/mean_terminated_length": 560.0, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.6609481645452868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.03657388058491051, + "learning_rate": 1.6938369699114376e-05, + "loss": 0.0015, + "num_tokens": 29874310.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 469.375, + "completions/mean_terminated_length": 469.375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.6611326323556539, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.054257039446383715, + "learning_rate": 1.6936050633175263e-05, + "loss": 0.0022, + "num_tokens": 29886097.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 274.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.6613171001660211, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.06538175465539098, + "learning_rate": 1.6933730848152544e-05, + "loss": 0.0026, + "num_tokens": 29892299.0, + "reward": 1.671875, + "reward_std": 0.3531585931777954, + "rewards/fixed_code_pass_all_test_reward/mean": 0.671875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3531585931777954, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 237.625, + "completions/mean_terminated_length": 237.625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6615015679763881, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.07822249410673976, + "learning_rate": 1.6931410344286722e-05, + "loss": 0.0031, + "num_tokens": 29897880.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 731.875, + "completions/mean_terminated_length": 543.857177734375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6616860357867552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38671875, + "kl": 0.018130575361283263, + "learning_rate": 1.6929089121818375e-05, + "loss": 0.0007, + "num_tokens": 29907247.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 404.625, + "completions/mean_terminated_length": 404.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6618705035971223, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.05864599673077464, + "learning_rate": 1.692676718098814e-05, + "loss": 0.0023, + "num_tokens": 29917476.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 278.25, + "completions/mean_terminated_length": 278.25, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6620549714074894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.027187709463760257, + "learning_rate": 1.692444452203675e-05, + "loss": 0.0011, + "num_tokens": 29922454.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 420.125, + "completions/mean_terminated_length": 420.125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.6622394392178564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.0859684725292027, + "learning_rate": 1.6922121145205e-05, + "loss": 0.0034, + "num_tokens": 29933687.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 463.875, + "completions/mean_terminated_length": 463.875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.6624239070282236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.03686350304633379, + "learning_rate": 1.6919797050733765e-05, + "loss": 0.0015, + "num_tokens": 29942454.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.6626083748385907, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.06840147660113871, + "learning_rate": 1.6917472238863988e-05, + "loss": 0.0027, + "num_tokens": 29952088.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.6627928426489578, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.515625, + "kl": 0.058992582373321056, + "learning_rate": 1.691514670983669e-05, + "loss": 0.0024, + "num_tokens": 29957500.0, + "reward": 1.6749999523162842, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 338.375, + "completions/mean_terminated_length": 338.375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.6629773104593248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36328125, + "kl": 0.05894047557376325, + "learning_rate": 1.691282046389297e-05, + "loss": 0.0024, + "num_tokens": 29964487.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 496.875, + "completions/mean_terminated_length": 496.875, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6631617782696919, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.024673782143509015, + "learning_rate": 1.691049350127399e-05, + "loss": 0.001, + "num_tokens": 29973606.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 236.5, + "completions/mean_terminated_length": 236.5, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.663346246080059, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.09928623866289854, + "learning_rate": 1.6908165822221004e-05, + "loss": 0.004, + "num_tokens": 29978474.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 530.25, + "completions/mean_terminated_length": 530.25, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.6635307138904262, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.04722412442788482, + "learning_rate": 1.6905837426975324e-05, + "loss": 0.0019, + "num_tokens": 29990364.0, + "reward": 1.0520832538604736, + "reward_std": 0.14731387794017792, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833320915699, + "rewards/fixed_code_pass_all_test_reward/std": 0.1473139077425003, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 440.375, + "completions/mean_terminated_length": 440.375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.6637151817007932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.029176080133765936, + "learning_rate": 1.6903508315778342e-05, + "loss": 0.0012, + "num_tokens": 29998839.0, + "reward": 0.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 315.125, + "completions/mean_terminated_length": 315.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.6638996495111603, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.06269190064631402, + "learning_rate": 1.690117848887153e-05, + "loss": 0.0025, + "num_tokens": 30005440.0, + "reward": 1.6126374006271362, + "reward_std": 0.26735803484916687, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6126374006271362, + "rewards/fixed_code_pass_all_test_reward/std": 0.26735803484916687, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 420.625, + "completions/mean_terminated_length": 420.625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.6640841173215274, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.034976087510585785, + "learning_rate": 1.6898847946496428e-05, + "loss": 0.0014, + "num_tokens": 30018597.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 334.0, + "completions/mean_terminated_length": 334.0, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.6642685851318945, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.05378284677863121, + "learning_rate": 1.6896516688894648e-05, + "loss": 0.0022, + "num_tokens": 30027829.0, + "reward": 1.3583333492279053, + "reward_std": 0.28270238637924194, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3583333492279053, + "rewards/fixed_code_pass_all_test_reward/std": 0.28270238637924194, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 353.0, + "completions/mean_terminated_length": 353.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6644530529422615, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.04154221131466329, + "learning_rate": 1.6894184716307877e-05, + "loss": 0.0017, + "num_tokens": 30034581.0, + "reward": 1.5723683834075928, + "reward_std": 0.13473652303218842, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5723683834075928, + "rewards/fixed_code_pass_all_test_reward/std": 0.1347365379333496, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 247.875, + "completions/mean_terminated_length": 247.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6646375207526287, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.05066532548516989, + "learning_rate": 1.6891852028977883e-05, + "loss": 0.002, + "num_tokens": 30044308.0, + "reward": 1.2904412746429443, + "reward_std": 0.15872822701931, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2904411852359772, + "rewards/fixed_code_pass_all_test_reward/std": 0.1587281972169876, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 717.0, + "completions/mean_terminated_length": 717.0, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "epoch": 0.6648219885629958, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.02740553067997098, + "learning_rate": 1.6889518627146505e-05, + "loss": 0.0011, + "num_tokens": 30062468.0, + "reward": 1.3125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 94.5, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.6650064563733629, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.15625, + "kl": 0.09422318125143647, + "learning_rate": 1.6887184511055648e-05, + "loss": 0.0038, + "num_tokens": 30066112.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.6651909241837299, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.056179068284109235, + "learning_rate": 1.68848496809473e-05, + "loss": 0.0022, + "num_tokens": 30070548.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 460.375, + "completions/mean_terminated_length": 460.375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.665375391994097, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.07556358119472861, + "learning_rate": 1.6882514137063525e-05, + "loss": 0.003, + "num_tokens": 30080423.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 501.375, + "completions/mean_terminated_length": 501.375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.6655598598044641, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.029042242909781635, + "learning_rate": 1.688017787964646e-05, + "loss": 0.0012, + "num_tokens": 30088722.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 228.25, + "completions/mean_terminated_length": 228.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6657443276148313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.05258803511969745, + "learning_rate": 1.6877840908938295e-05, + "loss": 0.0021, + "num_tokens": 30093468.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 243.125, + "completions/mean_terminated_length": 243.125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.6659287954251983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.03798463998828083, + "learning_rate": 1.687550322518133e-05, + "loss": 0.0015, + "num_tokens": 30098213.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 403.375, + "completions/mean_terminated_length": 403.375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.6661132632355654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.04534350661560893, + "learning_rate": 1.687316482861791e-05, + "loss": 0.0018, + "num_tokens": 30106856.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 281.0, + "completions/mean_terminated_length": 281.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.6662977310459325, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.04544827644713223, + "learning_rate": 1.6870825719490473e-05, + "loss": 0.0018, + "num_tokens": 30116056.0, + "reward": 1.9956896305084229, + "reward_std": 0.01219149399548769, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9956896305084229, + "rewards/fixed_code_pass_all_test_reward/std": 0.012191502377390862, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 192.125, + "completions/mean_terminated_length": 192.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.6664821988562996, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.140625, + "kl": 0.05729085230268538, + "learning_rate": 1.6868485898041514e-05, + "loss": 0.0023, + "num_tokens": 30120457.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 162.625, + "completions/mean_terminated_length": 162.625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.240234375, + "kl": 0.12405193597078323, + "learning_rate": 1.6866145364513613e-05, + "loss": 0.005, + "num_tokens": 30124758.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 170.25, + "completions/mean_terminated_length": 170.25, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6668511344770337, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.625, + "kl": 0.04399727890267968, + "learning_rate": 1.686380411914942e-05, + "loss": 0.0018, + "num_tokens": 30129128.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 316.0, + "completions/mean_terminated_length": 316.0, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.6670356022874009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.03026028093881905, + "learning_rate": 1.6861462162191665e-05, + "loss": 0.0012, + "num_tokens": 30134864.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 518.25, + "completions/mean_terminated_length": 518.25, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.667220070097768, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.05629445053637028, + "learning_rate": 1.6859119493883137e-05, + "loss": 0.0023, + "num_tokens": 30144266.0, + "reward": 1.4500000476837158, + "reward_std": 0.4869731366634369, + "rewards/fixed_code_pass_all_test_reward/mean": 0.44999998807907104, + "rewards/fixed_code_pass_all_test_reward/std": 0.4869731664657593, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 209.75, + "completions/mean_terminated_length": 209.75, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.667404537908135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.021892332122661173, + "learning_rate": 1.6856776114466717e-05, + "loss": 0.0009, + "num_tokens": 30149304.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 226.25, + "completions/mean_terminated_length": 226.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.6675890057185021, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.03007274738047272, + "learning_rate": 1.685443202418535e-05, + "loss": 0.0012, + "num_tokens": 30154178.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 506.125, + "completions/mean_terminated_length": 506.125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6677734735288692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.024745488539338112, + "learning_rate": 1.685208722328205e-05, + "loss": 0.001, + "num_tokens": 30163275.0, + "reward": 1.1750000715255737, + "reward_std": 0.0707106813788414, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17500001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 368.0, + "completions/mean_terminated_length": 368.0, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.6679579413392362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.050702113658189774, + "learning_rate": 1.6849741711999915e-05, + "loss": 0.002, + "num_tokens": 30170707.0, + "reward": 1.692307710647583, + "reward_std": 0.43319571018218994, + "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.43319568037986755, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 713.375, + "completions/mean_terminated_length": 713.375, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.6681424091496034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.03704624925740063, + "learning_rate": 1.684739549058211e-05, + "loss": 0.0015, + "num_tokens": 30183262.0, + "reward": 1.6458332538604736, + "reward_std": 0.49149513244628906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6458333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.49149513244628906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 260.25, + "completions/mean_terminated_length": 260.25, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.6683268769599705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.023277232074178755, + "learning_rate": 1.684504855927188e-05, + "loss": 0.0009, + "num_tokens": 30189168.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 356.5, + "completions/mean_terminated_length": 356.5, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6685113447703376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.030712550156749785, + "learning_rate": 1.6842700918312532e-05, + "loss": 0.0012, + "num_tokens": 30195588.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 306.125, + "completions/mean_terminated_length": 306.125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.6686958125807047, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.06425913004204631, + "learning_rate": 1.6840352567947457e-05, + "loss": 0.0026, + "num_tokens": 30204101.0, + "reward": 1.84375, + "reward_std": 0.35197147727012634, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.35197150707244873, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 211.375, + "completions/mean_terminated_length": 211.375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.6688802803910717, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.026010525063611567, + "learning_rate": 1.6838003508420117e-05, + "loss": 0.001, + "num_tokens": 30208712.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 263.625, + "completions/mean_terminated_length": 263.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.6690647482014388, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.050524536054581404, + "learning_rate": 1.683565373997405e-05, + "loss": 0.002, + "num_tokens": 30216885.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 337.375, + "completions/mean_terminated_length": 337.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.669249216011806, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.025121041224338114, + "learning_rate": 1.683330326285286e-05, + "loss": 0.001, + "num_tokens": 30223168.0, + "reward": 1.375, + "reward_std": 0.3284160792827606, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.328416109085083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 439.5, + "completions/mean_terminated_length": 439.5, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.669433683822173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.03377341921441257, + "learning_rate": 1.6830952077300227e-05, + "loss": 0.0014, + "num_tokens": 30233700.0, + "reward": 1.7159090042114258, + "reward_std": 0.28927093744277954, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7159091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.28927096724510193, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 462.25, + "completions/mean_terminated_length": 462.25, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.6696181516325401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83984375, + "kl": 0.03839342808350921, + "learning_rate": 1.6828600183559914e-05, + "loss": 0.0015, + "num_tokens": 30242854.0, + "reward": 1.100000023841858, + "reward_std": 0.1069045215845108, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10000000149011612, + "rewards/fixed_code_pass_all_test_reward/std": 0.10690450668334961, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 415.75, + "completions/mean_terminated_length": 415.75, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.6698026194429072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.040028820745646954, + "learning_rate": 1.6826247581875744e-05, + "loss": 0.0016, + "num_tokens": 30255428.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 169.0, + "completions/mean_terminated_length": 169.0, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.6699870872532743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.03933128924109042, + "learning_rate": 1.6823894272491617e-05, + "loss": 0.0016, + "num_tokens": 30259516.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 190.75, + "completions/mean_terminated_length": 190.75, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.6701715550636413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.140625, + "kl": 0.06784466980025172, + "learning_rate": 1.6821540255651512e-05, + "loss": 0.0027, + "num_tokens": 30263970.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 293.75, + "completions/mean_terminated_length": 293.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6703560228740085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.05190662620589137, + "learning_rate": 1.681918553159948e-05, + "loss": 0.0021, + "num_tokens": 30272896.0, + "reward": 1.683333396911621, + "reward_std": 0.36121487617492676, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6833333373069763, + "rewards/fixed_code_pass_all_test_reward/std": 0.36121487617492676, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6705404906843756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.031712502939626575, + "learning_rate": 1.6816830100579637e-05, + "loss": 0.0013, + "num_tokens": 30278369.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 338.875, + "completions/mean_terminated_length": 338.875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.6707249584947427, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.04547358653508127, + "learning_rate": 1.6814473962836184e-05, + "loss": 0.0018, + "num_tokens": 30285936.0, + "reward": 1.0520833730697632, + "reward_std": 0.06200197711586952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0520833358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.06200198829174042, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.6709094263051097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.02471423684619367, + "learning_rate": 1.6812117118613386e-05, + "loss": 0.001, + "num_tokens": 30290414.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 293.25, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.6710938941154768, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.07697925763204694, + "learning_rate": 1.6809759568155587e-05, + "loss": 0.0031, + "num_tokens": 30297224.0, + "reward": 1.2916667461395264, + "reward_std": 0.7000566720962524, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5416666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 520.0, + "completions/mean_terminated_length": 520.0, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.6712783619258439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0277099609375, + "kl": 0.022352686617523432, + "learning_rate": 1.6807401311707203e-05, + "loss": 0.0009, + "num_tokens": 30308480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 366.0, + "completions/mean_terminated_length": 366.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6714628297362111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.08344901911914349, + "learning_rate": 1.680504234951272e-05, + "loss": 0.0033, + "num_tokens": 30317008.0, + "reward": 1.5, + "reward_std": 0.47245559096336365, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.47245559096336365, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 285.125, + "completions/mean_terminated_length": 285.125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6716472975465781, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.041772550670430064, + "learning_rate": 1.6802682681816703e-05, + "loss": 0.0017, + "num_tokens": 30323353.0, + "reward": 1.519230842590332, + "reward_std": 0.05439284071326256, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, + "rewards/fixed_code_pass_all_test_reward/std": 0.054392825812101364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 371.875, + "completions/mean_terminated_length": 371.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.6718317653569452, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.064312475733459, + "learning_rate": 1.680032230886378e-05, + "loss": 0.0026, + "num_tokens": 30331856.0, + "reward": 1.734375, + "reward_std": 0.33698704838752747, + "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.33698704838752747, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 338.0, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.6720162331673123, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.06125589879229665, + "learning_rate": 1.6797961230898665e-05, + "loss": 0.0025, + "num_tokens": 30338856.0, + "reward": 1.7437500953674316, + "reward_std": 0.29693374037742615, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7437499761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 206.625, + "completions/mean_terminated_length": 206.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.6722007009776794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.025104926084168255, + "learning_rate": 1.6795599448166138e-05, + "loss": 0.001, + "num_tokens": 30343645.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 273.375, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.6723851687880464, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.07905304967425764, + "learning_rate": 1.679323696091105e-05, + "loss": 0.0032, + "num_tokens": 30351120.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 252.75, + "completions/mean_terminated_length": 252.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6725696365984136, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.75, + "kl": 0.08563129277899861, + "learning_rate": 1.6790873769378327e-05, + "loss": 0.0034, + "num_tokens": 30356710.0, + "reward": 1.600000023841858, + "reward_std": 0.4276179373264313, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6000000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.42761799693107605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 674.75, + "completions/mean_terminated_length": 674.75, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "epoch": 0.6727541044087807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.74609375, + "kl": 0.04026862815953791, + "learning_rate": 1.6788509873812976e-05, + "loss": 0.0016, + "num_tokens": 30368948.0, + "reward": 1.5625, + "reward_std": 0.47087812423706055, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.47087812423706055, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 223.25, + "completions/mean_terminated_length": 223.25, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6729385722191478, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.07020091847516596, + "learning_rate": 1.6786145274460066e-05, + "loss": 0.0028, + "num_tokens": 30373654.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 233.625, + "completions/mean_terminated_length": 233.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6731230400295148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.018735545221716166, + "learning_rate": 1.678377997156474e-05, + "loss": 0.0007, + "num_tokens": 30378931.0, + "reward": 1.8888888359069824, + "reward_std": 0.20573778450489044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6733075078398819, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.03307704837061465, + "learning_rate": 1.678141396537222e-05, + "loss": 0.0013, + "num_tokens": 30389091.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 409.375, + "completions/mean_terminated_length": 409.375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.673491975650249, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.05448441533371806, + "learning_rate": 1.67790472561278e-05, + "loss": 0.0022, + "num_tokens": 30400118.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.6736764434606162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.03550602833274752, + "learning_rate": 1.677667984407684e-05, + "loss": 0.0014, + "num_tokens": 30404728.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 365.625, + "completions/mean_terminated_length": 365.625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.6738609112709832, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.033871169900521636, + "learning_rate": 1.6774311729464777e-05, + "loss": 0.0014, + "num_tokens": 30416701.0, + "reward": 1.5, + "reward_std": 0.022580957040190697, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.022580984979867935, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.6740453790813503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06201171875, + "kl": 0.04331885394640267, + "learning_rate": 1.6771942912537128e-05, + "loss": 0.0017, + "num_tokens": 30423210.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 190.375, + "completions/mean_terminated_length": 190.375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.6742298468917174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2275390625, + "kl": 0.05692193889990449, + "learning_rate": 1.6769573393539465e-05, + "loss": 0.0023, + "num_tokens": 30427717.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 233.5, + "completions/mean_terminated_length": 233.5, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.6744143147020845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.03306740149855614, + "learning_rate": 1.6767203172717457e-05, + "loss": 0.0013, + "num_tokens": 30437441.0, + "reward": 1.9038461446762085, + "reward_std": 0.2719641625881195, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.6745987825124515, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0, + "kl": 0.06323527102358639, + "learning_rate": 1.6764832250316827e-05, + "loss": 0.0025, + "num_tokens": 30443021.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 294.75, + "completions/mean_terminated_length": 294.75, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.6747832503228187, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1552734375, + "kl": 0.07102045184001327, + "learning_rate": 1.6762460626583378e-05, + "loss": 0.0028, + "num_tokens": 30450547.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6749677181331858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.054579525254666805, + "learning_rate": 1.6760088301762975e-05, + "loss": 0.0022, + "num_tokens": 30454990.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.6751521859435529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.033556015929207206, + "learning_rate": 1.675771527610158e-05, + "loss": 0.0013, + "num_tokens": 30464288.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 231.5, + "completions/mean_terminated_length": 231.5, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.6753366537539199, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.03203866514377296, + "learning_rate": 1.6755341549845198e-05, + "loss": 0.0013, + "num_tokens": 30469476.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.675521121564287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.03386988490819931, + "learning_rate": 1.6752967123239933e-05, + "loss": 0.0014, + "num_tokens": 30473928.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6757055893746541, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.04499918804503977, + "learning_rate": 1.6750591996531942e-05, + "loss": 0.0018, + "num_tokens": 30482087.0, + "reward": 1.9795454740524292, + "reward_std": 0.05785420909523964, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9795454740524292, + "rewards/fixed_code_pass_all_test_reward/std": 0.05785420164465904, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 480.375, + "completions/mean_terminated_length": 480.375, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.6758900571850213, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.053676349343732, + "learning_rate": 1.6748216169967465e-05, + "loss": 0.0021, + "num_tokens": 30496882.0, + "reward": 1.519230842590332, + "reward_std": 0.5147855877876282, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5192307829856873, + "rewards/fixed_code_pass_all_test_reward/std": 0.514785647392273, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 331.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.6760745249953883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.054519183468073606, + "learning_rate": 1.674583964379281e-05, + "loss": 0.0022, + "num_tokens": 30520343.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 391.75, + "completions/mean_terminated_length": 391.75, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6762589928057554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.640625, + "kl": 0.06618706090375781, + "learning_rate": 1.674346241825437e-05, + "loss": 0.0026, + "num_tokens": 30527437.0, + "reward": 1.8958332538604736, + "reward_std": 0.294627845287323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 328.125, + "completions/mean_terminated_length": 328.125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6764434606161225, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.08858846966177225, + "learning_rate": 1.674108449359858e-05, + "loss": 0.0035, + "num_tokens": 30538302.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 630.125, + "completions/mean_terminated_length": 630.125, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "epoch": 0.6766279284264896, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.040105503518134356, + "learning_rate": 1.6738705870071986e-05, + "loss": 0.0016, + "num_tokens": 30554223.0, + "reward": 1.2857142686843872, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2857142984867096, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 265.375, + "completions/mean_terminated_length": 265.375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.6768123962368566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.020336553105153143, + "learning_rate": 1.6736326547921177e-05, + "loss": 0.0008, + "num_tokens": 30562034.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 477.875, + "completions/mean_terminated_length": 477.875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.6769968640472238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.045014480827376246, + "learning_rate": 1.6733946527392832e-05, + "loss": 0.0018, + "num_tokens": 30574513.0, + "reward": 1.375, + "reward_std": 0.2653239965438843, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2653239965438843, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 213.0, + "completions/mean_terminated_length": 213.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6771813318575909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2119140625, + "kl": 0.059637637343257666, + "learning_rate": 1.673156580873369e-05, + "loss": 0.0024, + "num_tokens": 30580593.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 219.5, + "completions/mean_terminated_length": 219.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.677365799667958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.029901932226493955, + "learning_rate": 1.6729184392190575e-05, + "loss": 0.0012, + "num_tokens": 30585189.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 426.25, + "completions/mean_terminated_length": 426.25, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.677550267478325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.04053531982935965, + "learning_rate": 1.6726802278010365e-05, + "loss": 0.0016, + "num_tokens": 30594663.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 379.25, + "completions/mean_terminated_length": 379.25, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.6777347352886921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1005859375, + "kl": 0.06520006060600281, + "learning_rate": 1.6724419466440035e-05, + "loss": 0.0026, + "num_tokens": 30604673.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 480.125, + "completions/mean_terminated_length": 480.125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.6779192030990592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.039823418483138084, + "learning_rate": 1.6722035957726607e-05, + "loss": 0.0016, + "num_tokens": 30613386.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 454.75, + "completions/mean_terminated_length": 454.75, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.6781036709094264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84765625, + "kl": 0.04387431056238711, + "learning_rate": 1.6719651752117198e-05, + "loss": 0.0018, + "num_tokens": 30621880.0, + "reward": 1.46875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 358.625, + "completions/mean_terminated_length": 358.625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.6782881387197934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.057102636666968465, + "learning_rate": 1.6717266849858978e-05, + "loss": 0.0023, + "num_tokens": 30631581.0, + "reward": 1.932692289352417, + "reward_std": 0.1903749406337738, + "rewards/fixed_code_pass_all_test_reward/mean": 0.932692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.19037489593029022, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 197.875, + "completions/mean_terminated_length": 197.875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.6784726065301605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2099609375, + "kl": 0.072473946493119, + "learning_rate": 1.6714881251199204e-05, + "loss": 0.0029, + "num_tokens": 30636044.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 578.25, + "completions/mean_terminated_length": 578.25, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.6786570743405276, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734375, + "kl": 0.033939379965886474, + "learning_rate": 1.6712494956385195e-05, + "loss": 0.0014, + "num_tokens": 30648022.0, + "reward": 1.1184210777282715, + "reward_std": 0.059678610414266586, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1184210479259491, + "rewards/fixed_code_pass_all_test_reward/std": 0.05967859923839569, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 349.625, + "completions/mean_terminated_length": 349.625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.6788415421508947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.05604859907180071, + "learning_rate": 1.6710107965664354e-05, + "loss": 0.0022, + "num_tokens": 30656323.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 231.375, + "completions/mean_terminated_length": 231.375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6790260099612617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.083984375, + "kl": 0.03550931951031089, + "learning_rate": 1.6707720279284138e-05, + "loss": 0.0014, + "num_tokens": 30661702.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 385.75, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.6792104777716288, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.054418426705524325, + "learning_rate": 1.6705331897492088e-05, + "loss": 0.0022, + "num_tokens": 30672044.0, + "reward": 1.3191488981246948, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3191489279270172, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 192.5, + "completions/mean_terminated_length": 192.5, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.679394945581996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.018756062956526875, + "learning_rate": 1.6702942820535823e-05, + "loss": 0.0008, + "num_tokens": 30676440.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 483.25, + "completions/mean_terminated_length": 483.25, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.679579413392363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.05454242089763284, + "learning_rate": 1.6700553048663014e-05, + "loss": 0.0022, + "num_tokens": 30687514.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1202.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 553.25, + "completions/mean_terminated_length": 553.25, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.6797638812027301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6328125, + "kl": 0.029599675559438765, + "learning_rate": 1.669816258212143e-05, + "loss": 0.0012, + "num_tokens": 30699556.0, + "reward": 1.5933098793029785, + "reward_std": 0.4485257863998413, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7183098196983337, + "rewards/fixed_code_pass_all_test_reward/std": 0.16698987782001495, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 625.125, + "completions/mean_terminated_length": 625.125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.6799483490130972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.02558919822331518, + "learning_rate": 1.6695771421158894e-05, + "loss": 0.001, + "num_tokens": 30713629.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 411.625, + "completions/mean_terminated_length": 411.625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.6801328168234643, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.052797636250033975, + "learning_rate": 1.66933795660233e-05, + "loss": 0.0021, + "num_tokens": 30721330.0, + "reward": 1.900362253189087, + "reward_std": 0.2818179428577423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9003623127937317, + "rewards/fixed_code_pass_all_test_reward/std": 0.2818179130554199, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6803172846338313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.0236003496684134, + "learning_rate": 1.669098701696263e-05, + "loss": 0.0009, + "num_tokens": 30725889.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.6805017524441985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.043364679673686624, + "learning_rate": 1.6688593774224918e-05, + "loss": 0.0017, + "num_tokens": 30730625.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 396.375, + "completions/mean_terminated_length": 396.375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.6806862202545656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.05513603752478957, + "learning_rate": 1.6686199838058284e-05, + "loss": 0.0022, + "num_tokens": 30742692.0, + "reward": 1.2512136697769165, + "reward_std": 0.0908447802066803, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25121361017227173, + "rewards/fixed_code_pass_all_test_reward/std": 0.09084472805261612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 287.25, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.6808706880649327, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.05261865328066051, + "learning_rate": 1.6683805208710915e-05, + "loss": 0.0021, + "num_tokens": 30748894.0, + "reward": 1.734375, + "reward_std": 0.36659735441207886, + "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.36659735441207886, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 345.5, + "completions/mean_terminated_length": 345.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6810551558752997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.05463936785236001, + "learning_rate": 1.668140988643107e-05, + "loss": 0.0022, + "num_tokens": 30759506.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.6812396236856668, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.04200354893691838, + "learning_rate": 1.667901387146708e-05, + "loss": 0.0017, + "num_tokens": 30766830.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 403.375, + "completions/mean_terminated_length": 403.375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.6814240914960339, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.03389909607358277, + "learning_rate": 1.6676617164067346e-05, + "loss": 0.0014, + "num_tokens": 30774169.0, + "reward": 1.784482717514038, + "reward_std": 0.06957854330539703, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7844827175140381, + "rewards/fixed_code_pass_all_test_reward/std": 0.06957856565713882, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 626.625, + "completions/mean_terminated_length": 626.625, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.6816085593064011, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734375, + "kl": 0.04003201948944479, + "learning_rate": 1.667421976448035e-05, + "loss": 0.0016, + "num_tokens": 30789254.0, + "reward": 1.829545497894287, + "reward_std": 0.3156205117702484, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8295454382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.3156205117702484, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 477.75, + "completions/mean_terminated_length": 477.75, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.6817930271167681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.03479286341462284, + "learning_rate": 1.6671821672954628e-05, + "loss": 0.0014, + "num_tokens": 30801772.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.6819774949271352, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.07828248082660139, + "learning_rate": 1.6669422889738807e-05, + "loss": 0.0031, + "num_tokens": 30807838.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.6821619627375023, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.03397643659263849, + "learning_rate": 1.666702341508157e-05, + "loss": 0.0014, + "num_tokens": 30812098.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 334.0, + "completions/mean_terminated_length": 334.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.6823464305478694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.07354838983155787, + "learning_rate": 1.6664623249231685e-05, + "loss": 0.0029, + "num_tokens": 30821282.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 355.0, + "completions/mean_terminated_length": 355.0, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.6825308983582364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2470703125, + "kl": 0.07811527559533715, + "learning_rate": 1.6662222392437982e-05, + "loss": 0.0031, + "num_tokens": 30832650.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 225.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.6827153661686036, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.0605581016279757, + "learning_rate": 1.6659820844949362e-05, + "loss": 0.0024, + "num_tokens": 30838402.0, + "reward": 1.6349999904632568, + "reward_std": 0.2622430622577667, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6349999904632568, + "rewards/fixed_code_pass_all_test_reward/std": 0.2622430920600891, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 323.5, + "completions/mean_terminated_length": 323.5, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.6828998339789707, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.04834237997420132, + "learning_rate": 1.6657418607014808e-05, + "loss": 0.0019, + "num_tokens": 30850270.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 318.75, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.6830843017893378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.043270950205624104, + "learning_rate": 1.6655015678883365e-05, + "loss": 0.0017, + "num_tokens": 30855620.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 655.625, + "completions/mean_terminated_length": 655.625, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "epoch": 0.6832687695997048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6640625, + "kl": 0.02660720539279282, + "learning_rate": 1.665261206080415e-05, + "loss": 0.0011, + "num_tokens": 30872817.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 277.5, + "completions/mean_terminated_length": 277.5, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.6834532374100719, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.65625, + "kl": 0.017618111392948776, + "learning_rate": 1.6650207753026366e-05, + "loss": 0.0007, + "num_tokens": 30878717.0, + "reward": 1.8958332538604736, + "reward_std": 0.294627845287323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 327.375, + "completions/mean_terminated_length": 327.375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.683637705220439, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.03448926075361669, + "learning_rate": 1.6647802755799258e-05, + "loss": 0.0014, + "num_tokens": 30886880.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 392.5, + "completions/mean_terminated_length": 392.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6838221730308062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.07361371186561882, + "learning_rate": 1.6645397069372175e-05, + "loss": 0.0029, + "num_tokens": 30896676.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 285.5, + "completions/mean_terminated_length": 285.5, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.6840066408411732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0537109375, + "kl": 0.023489110986702144, + "learning_rate": 1.664299069399451e-05, + "loss": 0.0009, + "num_tokens": 30906568.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6841911086515403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.07241898169741035, + "learning_rate": 1.664058362991575e-05, + "loss": 0.0029, + "num_tokens": 30912844.0, + "reward": 1.517045497894287, + "reward_std": 0.23673485219478607, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5170454978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.23673486709594727, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 443.625, + "completions/mean_terminated_length": 443.625, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.6843755764619074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.88671875, + "kl": 0.036237545078620315, + "learning_rate": 1.6638175877385442e-05, + "loss": 0.0014, + "num_tokens": 30921545.0, + "reward": 1.375, + "reward_std": 0.25253811478614807, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.25253814458847046, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 301.0, + "completions/mean_terminated_length": 301.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6845600442722745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.060210245195776224, + "learning_rate": 1.66357674366532e-05, + "loss": 0.0024, + "num_tokens": 30930217.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 348.125, + "completions/mean_terminated_length": 348.125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.6847445120826415, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.0596439428627491, + "learning_rate": 1.6633358307968722e-05, + "loss": 0.0024, + "num_tokens": 30938858.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 378.0, + "completions/mean_terminated_length": 378.0, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.6849289798930087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04982957674656063, + "learning_rate": 1.6630948491581763e-05, + "loss": 0.002, + "num_tokens": 30946562.0, + "reward": 1.9134615659713745, + "reward_std": 0.12462963908910751, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9134615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.12462963163852692, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 365.375, + "completions/mean_terminated_length": 365.375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.6851134477033758, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.044732645619660616, + "learning_rate": 1.6628537987742165e-05, + "loss": 0.0018, + "num_tokens": 30958557.0, + "reward": 1.9861111640930176, + "reward_std": 0.03928373008966446, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9861111044883728, + "rewards/fixed_code_pass_all_test_reward/std": 0.03928370773792267, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 295.625, + "completions/mean_terminated_length": 295.625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.6852979155137429, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.04951454885303974, + "learning_rate": 1.6626126796699828e-05, + "loss": 0.002, + "num_tokens": 30967930.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 221.25, + "completions/mean_terminated_length": 221.25, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.6854823833241099, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6875, + "kl": 0.06312663108110428, + "learning_rate": 1.6623714918704728e-05, + "loss": 0.0025, + "num_tokens": 30974876.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 584.75, + "completions/mean_terminated_length": 584.75, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.685666851134477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.028705685050226748, + "learning_rate": 1.6621302354006915e-05, + "loss": 0.0011, + "num_tokens": 30985450.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 738.0, + "completions/mean_terminated_length": 738.0, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "epoch": 0.6858513189448441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.02508676890283823, + "learning_rate": 1.6618889102856506e-05, + "loss": 0.001, + "num_tokens": 31002466.0, + "reward": 1.4444444179534912, + "reward_std": 0.4663894474506378, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4444444477558136, + "rewards/fixed_code_pass_all_test_reward/std": 0.4663894772529602, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 650.0, + "completions/max_terminated_length": 650.0, + "completions/mean_length": 498.0, + "completions/mean_terminated_length": 498.0, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.6860357867552113, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.10521011147648096, + "learning_rate": 1.661647516550369e-05, + "loss": 0.0042, + "num_tokens": 31011386.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 211.0, + "completions/mean_terminated_length": 211.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.6862202545655783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.02660631516482681, + "learning_rate": 1.661406054219873e-05, + "loss": 0.0011, + "num_tokens": 31016386.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 355.125, + "completions/mean_terminated_length": 355.125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.6864047223759454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.059233935084193945, + "learning_rate": 1.6611645233191957e-05, + "loss": 0.0024, + "num_tokens": 31027163.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 372.25, + "completions/mean_terminated_length": 372.25, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.6865891901863125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.038138950476422906, + "learning_rate": 1.6609229238733776e-05, + "loss": 0.0015, + "num_tokens": 31036197.0, + "reward": 1.756250023841858, + "reward_std": 0.31672146916389465, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7562500238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.31672149896621704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 449.875, + "completions/mean_terminated_length": 449.875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.6867736579966796, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.03688846342265606, + "learning_rate": 1.660681255907466e-05, + "loss": 0.0015, + "num_tokens": 31047116.0, + "reward": 1.7697367668151855, + "reward_std": 0.017285054549574852, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7697368264198303, + "rewards/fixed_code_pass_all_test_reward/std": 0.0172850601375103, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.6869581258070466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.038218742702156305, + "learning_rate": 1.660439519446515e-05, + "loss": 0.0015, + "num_tokens": 31055852.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 352.875, + "completions/mean_terminated_length": 352.875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.6871425936174138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.79296875, + "kl": 0.03660299978218973, + "learning_rate": 1.660197714515587e-05, + "loss": 0.0015, + "num_tokens": 31063051.0, + "reward": 1.9673912525177002, + "reward_std": 0.09223129600286484, + "rewards/fixed_code_pass_all_test_reward/mean": 0.967391312122345, + "rewards/fixed_code_pass_all_test_reward/std": 0.09223131835460663, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 599.25, + "completions/mean_terminated_length": 599.25, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "epoch": 0.6873270614277809, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.03879402298480272, + "learning_rate": 1.65995584113975e-05, + "loss": 0.0016, + "num_tokens": 31076405.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 394.375, + "completions/mean_terminated_length": 394.375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.687511529238148, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.05983232660219073, + "learning_rate": 1.65971389934408e-05, + "loss": 0.0024, + "num_tokens": 31085816.0, + "reward": 1.6486486196517944, + "reward_std": 0.05004430189728737, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6486486196517944, + "rewards/fixed_code_pass_all_test_reward/std": 0.050044331699609756, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.687695997048515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.05465828743763268, + "learning_rate": 1.659471889153661e-05, + "loss": 0.0022, + "num_tokens": 31091249.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 416.625, + "completions/mean_terminated_length": 416.625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6878804648588821, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.040180700132623315, + "learning_rate": 1.6592298105935814e-05, + "loss": 0.0016, + "num_tokens": 31100854.0, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 299.125, + "completions/mean_terminated_length": 299.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.6880649326692492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0546875, + "kl": 0.04227416682988405, + "learning_rate": 1.6589876636889392e-05, + "loss": 0.0017, + "num_tokens": 31109759.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 300.25, + "completions/mean_terminated_length": 300.25, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.6882494004796164, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.11573283281177282, + "learning_rate": 1.658745448464838e-05, + "loss": 0.0046, + "num_tokens": 31118129.0, + "reward": 1.9419643878936768, + "reward_std": 0.06313452869653702, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9419642686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.06313455104827881, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.6884338682899834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.03768581850454211, + "learning_rate": 1.65850316494639e-05, + "loss": 0.0015, + "num_tokens": 31124788.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 699.75, + "completions/mean_terminated_length": 699.75, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "epoch": 0.6886183361003505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.03282665414735675, + "learning_rate": 1.658260813158713e-05, + "loss": 0.0013, + "num_tokens": 31139770.0, + "reward": 1.7788461446762085, + "reward_std": 0.41411587595939636, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7788461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.41411590576171875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 330.25, + "completions/mean_terminated_length": 330.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.6888028039107176, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.02671283797826618, + "learning_rate": 1.6580183931269318e-05, + "loss": 0.0011, + "num_tokens": 31148548.0, + "reward": 1.8041666746139526, + "reward_std": 0.27970364689826965, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8041666746139526, + "rewards/fixed_code_pass_all_test_reward/std": 0.27970364689826965, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 382.0, + "completions/mean_terminated_length": 382.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.6889872717210846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.059421731159090996, + "learning_rate": 1.65777590487618e-05, + "loss": 0.0024, + "num_tokens": 31156820.0, + "reward": 1.6836419105529785, + "reward_std": 0.26196935772895813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6836419701576233, + "rewards/fixed_code_pass_all_test_reward/std": 0.26196932792663574, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 138.5, + "completions/mean_terminated_length": 138.5, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6891717395314517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.091796875, + "kl": 0.05504172947257757, + "learning_rate": 1.6575333484315964e-05, + "loss": 0.0022, + "num_tokens": 31160768.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 442.375, + "completions/mean_terminated_length": 442.375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.6893562073418189, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.625, + "kl": 0.022291914792731404, + "learning_rate": 1.657290723818328e-05, + "loss": 0.0009, + "num_tokens": 31173811.0, + "reward": 1.6229338645935059, + "reward_std": 0.13969051837921143, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6229339241981506, + "rewards/fixed_code_pass_all_test_reward/std": 0.1396905481815338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.689540675152186, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.0274247172055766, + "learning_rate": 1.6570480310615285e-05, + "loss": 0.0011, + "num_tokens": 31180298.0, + "reward": 1.1875, + "reward_std": 0.3282996118068695, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3282995820045471, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 315.0, + "completions/mean_terminated_length": 315.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.689725142962553, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1513671875, + "kl": 0.08229600777849555, + "learning_rate": 1.6568052701863586e-05, + "loss": 0.0033, + "num_tokens": 31189074.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 288.625, + "completions/mean_terminated_length": 288.625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6899096107729201, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.029903542017564178, + "learning_rate": 1.656562441217986e-05, + "loss": 0.0012, + "num_tokens": 31194415.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 412.5, + "completions/mean_terminated_length": 412.5, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.6900940785832872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.04061175766400993, + "learning_rate": 1.6563195441815855e-05, + "loss": 0.0016, + "num_tokens": 31206131.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 369.375, + "completions/mean_terminated_length": 369.375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.6902785463936543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.022111187456175685, + "learning_rate": 1.6560765791023395e-05, + "loss": 0.0009, + "num_tokens": 31213926.0, + "reward": 1.2000000476837158, + "reward_std": 0.38544961810112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.20000000298023224, + "rewards/fixed_code_pass_all_test_reward/std": 0.38544967770576477, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 221.625, + "completions/mean_terminated_length": 221.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6904630142040215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.04408752894960344, + "learning_rate": 1.6558335460054367e-05, + "loss": 0.0018, + "num_tokens": 31218947.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 330.125, + "completions/mean_terminated_length": 330.125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.6906474820143885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.04815824655815959, + "learning_rate": 1.6555904449160734e-05, + "loss": 0.0019, + "num_tokens": 31226260.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 358.5, + "completions/mean_terminated_length": 358.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6908319498247556, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.05263505480252206, + "learning_rate": 1.6553472758594522e-05, + "loss": 0.0021, + "num_tokens": 31233432.0, + "reward": 1.875, + "reward_std": 0.23754699528217316, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.23754698038101196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 216.875, + "completions/mean_terminated_length": 216.875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.6910164176351227, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.042811705032363534, + "learning_rate": 1.6551040388607838e-05, + "loss": 0.0017, + "num_tokens": 31238175.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 378.375, + "completions/mean_terminated_length": 378.375, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.6912008854454897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.025162800797261298, + "learning_rate": 1.6548607339452853e-05, + "loss": 0.001, + "num_tokens": 31245602.0, + "reward": 1.4074074029922485, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.40740740299224854, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 356.0, + "completions/mean_terminated_length": 356.0, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.6913853532558568, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.04650401254184544, + "learning_rate": 1.6546173611381805e-05, + "loss": 0.0019, + "num_tokens": 31255274.0, + "reward": 1.8125, + "reward_std": 0.33407655358314514, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.06681530922651291, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 397.125, + "completions/mean_terminated_length": 397.125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6915698210662239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.05231584212742746, + "learning_rate": 1.654373920464701e-05, + "loss": 0.0021, + "num_tokens": 31263923.0, + "reward": 1.1875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 390.875, + "completions/mean_terminated_length": 390.875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.6917542888765911, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.034739643801003695, + "learning_rate": 1.6541304119500853e-05, + "loss": 0.0014, + "num_tokens": 31271290.0, + "reward": 1.7406914234161377, + "reward_std": 0.01880604401230812, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7406914830207825, + "rewards/fixed_code_pass_all_test_reward/std": 0.018806051462888718, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 555.75, + "completions/mean_terminated_length": 555.75, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.6919387566869581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.023392248433083296, + "learning_rate": 1.6538868356195787e-05, + "loss": 0.0009, + "num_tokens": 31285776.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 737.125, + "completions/mean_terminated_length": 737.125, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "epoch": 0.6921232244973252, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040771484375, + "kl": 0.021976477495627478, + "learning_rate": 1.653643191498433e-05, + "loss": 0.0009, + "num_tokens": 31302281.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 328.625, + "completions/mean_terminated_length": 328.625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.6923076923076923, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.04640924069099128, + "learning_rate": 1.653399479611908e-05, + "loss": 0.0019, + "num_tokens": 31309894.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 346.875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.6924921601180594, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.030307766748592257, + "learning_rate": 1.6531556999852703e-05, + "loss": 0.0012, + "num_tokens": 31321237.0, + "reward": 1.8562500476837158, + "reward_std": 0.23366260528564453, + "rewards/fixed_code_pass_all_test_reward/mean": 0.856249988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.23366262018680573, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 266.125, + "completions/mean_terminated_length": 266.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.6926766279284264, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.0664343957323581, + "learning_rate": 1.652911852643793e-05, + "loss": 0.0027, + "num_tokens": 31327390.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 334.25, + "completions/mean_terminated_length": 334.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.6928610957387936, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.039893782464787364, + "learning_rate": 1.652667937612757e-05, + "loss": 0.0016, + "num_tokens": 31334312.0, + "reward": 1.3333333730697632, + "reward_std": 0.35634827613830566, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3333333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.35634830594062805, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 408.25, + "completions/mean_terminated_length": 408.25, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6930455635491607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.03531626146286726, + "learning_rate": 1.6524239549174496e-05, + "loss": 0.0014, + "num_tokens": 31348082.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 178.5, + "completions/mean_terminated_length": 178.5, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.6932300313595278, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05047107767313719, + "learning_rate": 1.652179904583165e-05, + "loss": 0.002, + "num_tokens": 31352222.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 699.25, + "completions/mean_terminated_length": 699.25, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "epoch": 0.6934144991698948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.60546875, + "kl": 0.022191372932866216, + "learning_rate": 1.651935786635205e-05, + "loss": 0.0009, + "num_tokens": 31364880.0, + "reward": 1.7757353782653809, + "reward_std": 0.22887110710144043, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7757352590560913, + "rewards/fixed_code_pass_all_test_reward/std": 0.22887110710144043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 155.625, + "completions/mean_terminated_length": 155.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.6935989669802619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.024636907037347555, + "learning_rate": 1.6516916010988784e-05, + "loss": 0.001, + "num_tokens": 31369053.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 314.875, + "completions/mean_terminated_length": 314.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.693783434790629, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.05058677773922682, + "learning_rate": 1.6514473479995003e-05, + "loss": 0.002, + "num_tokens": 31377700.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 405.5, + "completions/mean_terminated_length": 405.5, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.6939679026009962, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.03490825439803302, + "learning_rate": 1.651203027362393e-05, + "loss": 0.0014, + "num_tokens": 31385592.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 381.5, + "completions/mean_terminated_length": 381.5, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.6941523704113632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.025275283493101597, + "learning_rate": 1.6509586392128865e-05, + "loss": 0.001, + "num_tokens": 31392868.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.6943368382217303, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.03417039941996336, + "learning_rate": 1.6507141835763173e-05, + "loss": 0.0014, + "num_tokens": 31402306.0, + "reward": 1.4595588445663452, + "reward_std": 0.15318132936954498, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4595588445663452, + "rewards/fixed_code_pass_all_test_reward/std": 0.15318137407302856, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6945213060320974, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.0877725169993937, + "learning_rate": 1.650469660478029e-05, + "loss": 0.0035, + "num_tokens": 31409622.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 590.125, + "completions/mean_terminated_length": 590.125, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.6947057738424645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.03946866886690259, + "learning_rate": 1.650225069943372e-05, + "loss": 0.0016, + "num_tokens": 31425255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 383.875, + "completions/mean_terminated_length": 383.875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.6948902416528315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.058194668497890234, + "learning_rate": 1.649980411997704e-05, + "loss": 0.0023, + "num_tokens": 31432806.0, + "reward": 1.8571429252624512, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8571428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 331.5, + "completions/mean_terminated_length": 331.5, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6950747094631987, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.4375, + "kl": 0.08639300125651062, + "learning_rate": 1.649735686666389e-05, + "loss": 0.0035, + "num_tokens": 31439122.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 305.375, + "completions/mean_terminated_length": 305.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6952591772735658, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05590819241479039, + "learning_rate": 1.649490893974799e-05, + "loss": 0.0022, + "num_tokens": 31449501.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 137.875, + "completions/mean_terminated_length": 137.875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.6954436450839329, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.09375, + "kl": 0.08549389196559787, + "learning_rate": 1.649246033948312e-05, + "loss": 0.0034, + "num_tokens": 31453356.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 312.0, + "completions/mean_terminated_length": 312.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.6956281128942999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.05683800647966564, + "learning_rate": 1.649001106612314e-05, + "loss": 0.0023, + "num_tokens": 31461892.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 226.75, + "completions/mean_terminated_length": 226.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.695812580704667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.048190937377512455, + "learning_rate": 1.648756111992197e-05, + "loss": 0.0019, + "num_tokens": 31470426.0, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 313.25, + "completions/mean_terminated_length": 313.25, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.6959970485150341, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06945065618492663, + "learning_rate": 1.648511050113361e-05, + "loss": 0.0028, + "num_tokens": 31479740.0, + "reward": 1.8023256063461304, + "reward_std": 0.3662329316139221, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8023256063461304, + "rewards/fixed_code_pass_all_test_reward/std": 0.3662329614162445, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.6961815163254013, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.05162624339573085, + "learning_rate": 1.6482659210012117e-05, + "loss": 0.0021, + "num_tokens": 31489240.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.6963659841357683, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.07444684160873294, + "learning_rate": 1.648020724681163e-05, + "loss": 0.003, + "num_tokens": 31493477.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 259.75, + "completions/mean_terminated_length": 259.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.6965504519461354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.02291830931790173, + "learning_rate": 1.647775461178635e-05, + "loss": 0.0009, + "num_tokens": 31499067.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 683.625, + "completions/mean_terminated_length": 683.625, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.6967349197565025, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.68359375, + "kl": 0.034360678400844336, + "learning_rate": 1.6475301305190546e-05, + "loss": 0.0014, + "num_tokens": 31515248.0, + "reward": 1.9659091234207153, + "reward_std": 0.09642363339662552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9659091234207153, + "rewards/fixed_code_pass_all_test_reward/std": 0.09642364084720612, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.6969193875668696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.055777879199013114, + "learning_rate": 1.6472847327278563e-05, + "loss": 0.0022, + "num_tokens": 31523157.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 285.75, + "completions/mean_terminated_length": 285.75, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.6971038553772366, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.0304083910305053, + "learning_rate": 1.647039267830482e-05, + "loss": 0.0012, + "num_tokens": 31531507.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 534.0, + "completions/mean_terminated_length": 534.0, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.6972883231876038, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.85546875, + "kl": 0.038748868741095066, + "learning_rate": 1.6467937358523788e-05, + "loss": 0.0016, + "num_tokens": 31540955.0, + "reward": 1.4821429252624512, + "reward_std": 0.9161254167556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7321428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.45456865429878235, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 182.25, + "completions/mean_terminated_length": 182.25, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.6974727909979709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.04506985703483224, + "learning_rate": 1.6465481368190025e-05, + "loss": 0.0018, + "num_tokens": 31545157.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 292.75, + "completions/mean_terminated_length": 292.75, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.697657258808338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.02738941623829305, + "learning_rate": 1.646302470755815e-05, + "loss": 0.0011, + "num_tokens": 31551819.0, + "reward": 1.4886363744735718, + "reward_std": 0.30321967601776123, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4886363446712494, + "rewards/fixed_code_pass_all_test_reward/std": 0.30321964621543884, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 404.125, + "completions/mean_terminated_length": 404.125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.697841726618705, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.039601166266947985, + "learning_rate": 1.6460567376882854e-05, + "loss": 0.0016, + "num_tokens": 31563508.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6980261944290721, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.04861987382173538, + "learning_rate": 1.6458109376418896e-05, + "loss": 0.0019, + "num_tokens": 31571212.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 365.125, + "completions/mean_terminated_length": 365.125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6982106622394392, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.050927909556776285, + "learning_rate": 1.6455650706421103e-05, + "loss": 0.002, + "num_tokens": 31578725.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 224.375, + "completions/mean_terminated_length": 224.375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.6983951300498064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.04488826100714505, + "learning_rate": 1.6453191367144377e-05, + "loss": 0.0018, + "num_tokens": 31583440.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6985795978601734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.061344979563727975, + "learning_rate": 1.6450731358843685e-05, + "loss": 0.0025, + "num_tokens": 31594916.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 265.625, + "completions/mean_terminated_length": 265.625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.6987640656705405, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.04234885121695697, + "learning_rate": 1.6448270681774062e-05, + "loss": 0.0017, + "num_tokens": 31603753.0, + "reward": 1.2941176891326904, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.29411765933036804, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 306.25, + "completions/mean_terminated_length": 306.25, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6989485334809076, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.06471045897342265, + "learning_rate": 1.6445809336190618e-05, + "loss": 0.0026, + "num_tokens": 31613947.0, + "reward": 1.9500000476837158, + "reward_std": 0.1414213627576828, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213478565216, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 302.0, + "completions/mean_terminated_length": 302.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6991330012912746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.07278218492865562, + "learning_rate": 1.6443347322348526e-05, + "loss": 0.0029, + "num_tokens": 31623323.0, + "reward": 1.8899999856948853, + "reward_std": 0.3111270070075989, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8899999856948853, + "rewards/fixed_code_pass_all_test_reward/std": 0.3111269772052765, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 256.125, + "completions/mean_terminated_length": 256.125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.6993174691016417, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.046646937960758805, + "learning_rate": 1.6440884640503035e-05, + "loss": 0.0019, + "num_tokens": 31632084.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 394.625, + "completions/mean_terminated_length": 394.625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6995019369120089, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.048393404576927423, + "learning_rate": 1.6438421290909453e-05, + "loss": 0.0019, + "num_tokens": 31643505.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 416.75, + "completions/mean_terminated_length": 416.75, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.699686404722376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.03222534840460867, + "learning_rate": 1.6435957273823172e-05, + "loss": 0.0013, + "num_tokens": 31652015.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.699870872532743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037353515625, + "kl": 0.02264452597592026, + "learning_rate": 1.643349258949964e-05, + "loss": 0.0009, + "num_tokens": 31657683.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 352.125, + "completions/mean_terminated_length": 352.125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.7000553403431101, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.038124822080135345, + "learning_rate": 1.643102723819438e-05, + "loss": 0.0015, + "num_tokens": 31665164.0, + "reward": 1.855263113975525, + "reward_std": 0.2738432288169861, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8552631139755249, + "rewards/fixed_code_pass_all_test_reward/std": 0.2738432288169861, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 342.75, + "completions/mean_terminated_length": 342.75, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.7002398081534772, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.03819207020569593, + "learning_rate": 1.6428561220162983e-05, + "loss": 0.0015, + "num_tokens": 31674378.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 230.0, + "completions/mean_terminated_length": 230.0, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.7004242759638443, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.04304635152220726, + "learning_rate": 1.6426094535661104e-05, + "loss": 0.0017, + "num_tokens": 31681906.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 323.625, + "completions/mean_terminated_length": 323.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7006087437742115, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.06083651352673769, + "learning_rate": 1.6423627184944484e-05, + "loss": 0.0024, + "num_tokens": 31691703.0, + "reward": 1.682692289352417, + "reward_std": 0.42345142364501953, + "rewards/fixed_code_pass_all_test_reward/mean": 0.682692289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.42345142364501953, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 256.75, + "completions/mean_terminated_length": 256.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7007932115845785, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.484375, + "kl": 0.06564672710373998, + "learning_rate": 1.6421159168268915e-05, + "loss": 0.0026, + "num_tokens": 31697621.0, + "reward": 1.4375, + "reward_std": 0.3471825420856476, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 434.5, + "completions/mean_terminated_length": 434.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.7009776793949456, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.029246112098917365, + "learning_rate": 1.641869048589026e-05, + "loss": 0.0012, + "num_tokens": 31707345.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 216.625, + "completions/mean_terminated_length": 216.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.7011621472053127, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.0412779722828418, + "learning_rate": 1.6416221138064464e-05, + "loss": 0.0017, + "num_tokens": 31715974.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 191.125, + "completions/mean_terminated_length": 191.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.7013466150156797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.01906467555090785, + "learning_rate": 1.641375112504753e-05, + "loss": 0.0008, + "num_tokens": 31720759.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 686.625, + "completions/mean_terminated_length": 686.625, + "completions/min_length": 621.0, + "completions/min_terminated_length": 621.0, + "epoch": 0.7015310828260468, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0220947265625, + "kl": 0.018477863166481256, + "learning_rate": 1.641128044709553e-05, + "loss": 0.0007, + "num_tokens": 31733588.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 575.625, + "completions/mean_terminated_length": 575.625, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.701715550636414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.03571358881890774, + "learning_rate": 1.640880910446461e-05, + "loss": 0.0014, + "num_tokens": 31744777.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 248.0, + "completions/mean_terminated_length": 248.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.7019000184467811, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.029027531621977687, + "learning_rate": 1.640633709741098e-05, + "loss": 0.0012, + "num_tokens": 31750945.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 116.875, + "completions/mean_terminated_length": 116.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7020844862571481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.030808203504420817, + "learning_rate": 1.6403864426190925e-05, + "loss": 0.0012, + "num_tokens": 31754592.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 351.125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.7022689540675152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.0548455259995535, + "learning_rate": 1.640139109106079e-05, + "loss": 0.0022, + "num_tokens": 31764145.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 219.0, + "completions/mean_terminated_length": 219.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.7024534218778823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.04162910836748779, + "learning_rate": 1.6398917092277e-05, + "loss": 0.0017, + "num_tokens": 31768953.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 206.25, + "completions/mean_terminated_length": 206.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.7026378896882494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.04900571936741471, + "learning_rate": 1.6396442430096032e-05, + "loss": 0.002, + "num_tokens": 31777107.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 323.375, + "completions/mean_terminated_length": 323.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.7028223574986165, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.022142526227980852, + "learning_rate": 1.6393967104774458e-05, + "loss": 0.0009, + "num_tokens": 31785790.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7030068253089836, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.05158341769129038, + "learning_rate": 1.639149111656889e-05, + "loss": 0.0021, + "num_tokens": 31791673.0, + "reward": 1.4728260040283203, + "reward_std": 0.015371894463896751, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4728260934352875, + "rewards/fixed_code_pass_all_test_reward/std": 0.015371893532574177, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.7031912931193507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.0340567163657397, + "learning_rate": 1.638901446573603e-05, + "loss": 0.0014, + "num_tokens": 31795821.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 239.5, + "completions/mean_terminated_length": 239.5, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7033757609297178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.03761658305302262, + "learning_rate": 1.6386537152532637e-05, + "loss": 0.0015, + "num_tokens": 31805121.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 137.125, + "completions/mean_terminated_length": 137.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7035602287400848, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.02764666045550257, + "learning_rate": 1.6384059177215544e-05, + "loss": 0.0011, + "num_tokens": 31808930.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 356.875, + "completions/mean_terminated_length": 356.875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7037446965504519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.039056660141795874, + "learning_rate": 1.6381580540041652e-05, + "loss": 0.0016, + "num_tokens": 31817249.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 288.125, + "completions/mean_terminated_length": 288.125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.703929164360819, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.06562286498956382, + "learning_rate": 1.6379101241267923e-05, + "loss": 0.0026, + "num_tokens": 31825330.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.7041136321711862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.04143834952265024, + "learning_rate": 1.63766212811514e-05, + "loss": 0.0017, + "num_tokens": 31833140.0, + "reward": 1.1691176891326904, + "reward_std": 0.020797276869416237, + "rewards/fixed_code_pass_all_test_reward/mean": 0.16911765933036804, + "rewards/fixed_code_pass_all_test_reward/std": 0.020797260105609894, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 249.375, + "completions/mean_terminated_length": 249.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.7042980999815532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.06332773389294744, + "learning_rate": 1.6374140659949193e-05, + "loss": 0.0025, + "num_tokens": 31837911.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 370.0, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7044825677919203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07373046875, + "kl": 0.05057633062824607, + "learning_rate": 1.6371659377918466e-05, + "loss": 0.002, + "num_tokens": 31845543.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 490.875, + "completions/mean_terminated_length": 490.875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.7046670356022874, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.03673546458594501, + "learning_rate": 1.6369177435316465e-05, + "loss": 0.0015, + "num_tokens": 31857318.0, + "reward": 1.4715908765792847, + "reward_std": 0.32884496450424194, + "rewards/fixed_code_pass_all_test_reward/mean": 0.47159093618392944, + "rewards/fixed_code_pass_all_test_reward/std": 0.32884499430656433, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 281.375, + "completions/mean_terminated_length": 281.375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.7048515034126545, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.04988833353854716, + "learning_rate": 1.6366694832400508e-05, + "loss": 0.002, + "num_tokens": 31865065.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.7050359712230215, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.203125, + "kl": 0.07212846912443638, + "learning_rate": 1.636421156942797e-05, + "loss": 0.0029, + "num_tokens": 31873914.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 263.5, + "completions/mean_terminated_length": 263.5, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.7052204390333887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.06595373246818781, + "learning_rate": 1.63617276466563e-05, + "loss": 0.0026, + "num_tokens": 31881726.0, + "reward": 1.7386362552642822, + "reward_std": 0.34850838780403137, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7386363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.348508358001709, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.7054049068437558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.06758049596101046, + "learning_rate": 1.635924306434301e-05, + "loss": 0.0027, + "num_tokens": 31885922.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 207.625, + "completions/mean_terminated_length": 207.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7055893746541229, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.05185369309037924, + "learning_rate": 1.6356757822745692e-05, + "loss": 0.0021, + "num_tokens": 31893559.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 233.0, + "completions/mean_terminated_length": 233.0, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.7057738424644899, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.054493971867486835, + "learning_rate": 1.6354271922121992e-05, + "loss": 0.0022, + "num_tokens": 31901223.0, + "reward": 1.8897058963775635, + "reward_std": 0.31195884943008423, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8897058963775635, + "rewards/fixed_code_pass_all_test_reward/std": 0.3119588792324066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 284.0, + "completions/mean_terminated_length": 284.0, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.705958310274857, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.039790594251826406, + "learning_rate": 1.635178536272964e-05, + "loss": 0.0016, + "num_tokens": 31911239.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 283.875, + "completions/mean_terminated_length": 283.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.7061427780852241, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.055719359777867794, + "learning_rate": 1.634929814482642e-05, + "loss": 0.0022, + "num_tokens": 31917486.0, + "reward": 1.875, + "reward_std": 0.2829941511154175, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.28299421072006226, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 446.375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7063272458955913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03955078125, + "kl": 0.02822624403052032, + "learning_rate": 1.634681026867019e-05, + "loss": 0.0011, + "num_tokens": 31925697.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 250.5, + "completions/mean_terminated_length": 250.5, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.7065117137059583, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.125, + "kl": 0.16608937783166766, + "learning_rate": 1.6344321734518884e-05, + "loss": 0.0066, + "num_tokens": 31933677.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 296.0, + "completions/mean_terminated_length": 296.0, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.7066961815163254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.08262318256311119, + "learning_rate": 1.6341832542630486e-05, + "loss": 0.0033, + "num_tokens": 31942453.0, + "reward": 1.625, + "reward_std": 0.27319857478141785, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.27319860458374023, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7068806493266925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.08549753716215491, + "learning_rate": 1.6339342693263067e-05, + "loss": 0.0034, + "num_tokens": 31951370.0, + "reward": 1.653846263885498, + "reward_std": 0.477737694978714, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.477737694978714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 192.5, + "completions/mean_terminated_length": 192.5, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.7070651171370596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2216796875, + "kl": 0.033953696954995394, + "learning_rate": 1.633685218667475e-05, + "loss": 0.0014, + "num_tokens": 31956198.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 326.5, + "completions/mean_terminated_length": 326.5, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7072495849474266, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.059018890373408794, + "learning_rate": 1.6334361023123743e-05, + "loss": 0.0024, + "num_tokens": 31967770.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 395.125, + "completions/mean_terminated_length": 395.125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7074340527577938, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.031166540924459696, + "learning_rate": 1.6331869202868308e-05, + "loss": 0.0012, + "num_tokens": 31975931.0, + "reward": 1.6750000715255737, + "reward_std": 0.348124235868454, + "rewards/fixed_code_pass_all_test_reward/mean": 0.675000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.348124235868454, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 145.375, + "completions/mean_terminated_length": 145.375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.7076185205681609, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29296875, + "kl": 0.049973453977145255, + "learning_rate": 1.632937672616678e-05, + "loss": 0.002, + "num_tokens": 31979966.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 287.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.707802988378528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.03371069743297994, + "learning_rate": 1.6326883593277568e-05, + "loss": 0.0013, + "num_tokens": 31987992.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 289.375, + "completions/mean_terminated_length": 289.375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.707987456188895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6953125, + "kl": 0.052454553777351975, + "learning_rate": 1.632438980445914e-05, + "loss": 0.0021, + "num_tokens": 31994459.0, + "reward": 1.5163042545318604, + "reward_std": 0.424370676279068, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5163043737411499, + "rewards/fixed_code_pass_all_test_reward/std": 0.4243707060813904, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 224.5, + "completions/mean_terminated_length": 224.5, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.7081719239992621, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.04932348383590579, + "learning_rate": 1.632189535997003e-05, + "loss": 0.002, + "num_tokens": 32001327.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.7083563918096292, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.014756768650840968, + "learning_rate": 1.6319400260068854e-05, + "loss": 0.0006, + "num_tokens": 32007726.0, + "reward": 1.5443549156188965, + "reward_std": 0.25658419728279114, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5443547964096069, + "rewards/fixed_code_pass_all_test_reward/std": 0.25658416748046875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 257.625, + "completions/mean_terminated_length": 257.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7085408596199964, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5625, + "kl": 0.05971781606785953, + "learning_rate": 1.631690450501428e-05, + "loss": 0.0024, + "num_tokens": 32015195.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 519.125, + "completions/mean_terminated_length": 519.125, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.7087253274303634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73046875, + "kl": 0.04590933315921575, + "learning_rate": 1.6314408095065062e-05, + "loss": 0.0018, + "num_tokens": 32026684.0, + "reward": 1.75, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 321.5, + "completions/mean_terminated_length": 321.5, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.7089097952407305, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.035430294228717685, + "learning_rate": 1.631191103048e-05, + "loss": 0.0014, + "num_tokens": 32036648.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 216.875, + "completions/mean_terminated_length": 216.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.7090942630510976, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.09375, + "kl": 0.35876134247519076, + "learning_rate": 1.6309413311517975e-05, + "loss": 0.0144, + "num_tokens": 32044927.0, + "reward": 1.2619047164916992, + "reward_std": 0.42667755484580994, + "rewards/fixed_code_pass_all_test_reward/mean": 0.261904776096344, + "rewards/fixed_code_pass_all_test_reward/std": 0.4266776144504547, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 316.0, + "completions/mean_terminated_length": 316.0, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.7092787308614646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.01867935643531382, + "learning_rate": 1.6306914938437943e-05, + "loss": 0.0007, + "num_tokens": 32051159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 515.375, + "completions/mean_terminated_length": 515.375, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.7094631986718317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.021954603493213654, + "learning_rate": 1.6304415911498907e-05, + "loss": 0.0009, + "num_tokens": 32061418.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 407.75, + "completions/mean_terminated_length": 407.75, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.7096476664821989, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0849609375, + "kl": 0.04198176274076104, + "learning_rate": 1.6301916230959953e-05, + "loss": 0.0017, + "num_tokens": 32067720.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 526.5, + "completions/mean_terminated_length": 526.5, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.709832134292566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.05899894016329199, + "learning_rate": 1.6299415897080234e-05, + "loss": 0.0024, + "num_tokens": 32080916.0, + "reward": 1.59375, + "reward_std": 0.4419417381286621, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4419417679309845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 204.75, + "completions/mean_terminated_length": 204.75, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.710016602102933, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.042614942183718085, + "learning_rate": 1.629691491011897e-05, + "loss": 0.0017, + "num_tokens": 32087890.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 251.125, + "completions/mean_terminated_length": 251.125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.7102010699133001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.201171875, + "kl": 0.06426581903360784, + "learning_rate": 1.6294413270335437e-05, + "loss": 0.0026, + "num_tokens": 32094315.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 374.5, + "completions/mean_terminated_length": 374.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7103855377236672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.06572533422149718, + "learning_rate": 1.6291910977988998e-05, + "loss": 0.0026, + "num_tokens": 32101343.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 228.5, + "completions/mean_terminated_length": 228.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7105700055340343, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.10688682459294796, + "learning_rate": 1.6289408033339073e-05, + "loss": 0.0043, + "num_tokens": 32108803.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 279.625, + "completions/mean_terminated_length": 279.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7107544733444014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.02985777670983225, + "learning_rate": 1.6286904436645145e-05, + "loss": 0.0012, + "num_tokens": 32113760.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 262.0, + "completions/mean_terminated_length": 262.0, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.7109389411547685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.028253308322746307, + "learning_rate": 1.6284400188166776e-05, + "loss": 0.0011, + "num_tokens": 32118768.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 332.75, + "completions/mean_terminated_length": 332.75, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7111234089651356, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.04902401682920754, + "learning_rate": 1.6281895288163587e-05, + "loss": 0.002, + "num_tokens": 32127598.0, + "reward": 1.9010417461395264, + "reward_std": 0.27989640831947327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9010416865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.27989643812179565, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 225.375, + "completions/mean_terminated_length": 225.375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.7113078767755027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.07027927599847317, + "learning_rate": 1.627938973689527e-05, + "loss": 0.0028, + "num_tokens": 32134049.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 325.625, + "completions/mean_terminated_length": 325.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.7114923445858697, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0322265625, + "kl": 0.01673012087121606, + "learning_rate": 1.6276883534621582e-05, + "loss": 0.0007, + "num_tokens": 32141102.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 197.375, + "completions/mean_terminated_length": 197.375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7116768123962368, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.05023044999688864, + "learning_rate": 1.6274376681602353e-05, + "loss": 0.002, + "num_tokens": 32145625.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.711861280206604, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.03454492520540953, + "learning_rate": 1.6271869178097474e-05, + "loss": 0.0014, + "num_tokens": 32153640.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 251.25, + "completions/mean_terminated_length": 251.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.7120457480169711, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.04574429360218346, + "learning_rate": 1.626936102436691e-05, + "loss": 0.0018, + "num_tokens": 32159578.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 180.0, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.7122302158273381, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.0628775724908337, + "learning_rate": 1.626685222067068e-05, + "loss": 0.0025, + "num_tokens": 32163938.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 285.25, + "completions/mean_terminated_length": 285.25, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7124146836377052, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.04103571688756347, + "learning_rate": 1.6264342767268892e-05, + "loss": 0.0016, + "num_tokens": 32169540.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 299.5, + "completions/mean_terminated_length": 299.5, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7125991514480723, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.04030704067554325, + "learning_rate": 1.6261832664421705e-05, + "loss": 0.0016, + "num_tokens": 32176048.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 333.375, + "completions/mean_terminated_length": 333.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7127836192584394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.05137320491485298, + "learning_rate": 1.6259321912389348e-05, + "loss": 0.0021, + "num_tokens": 32187123.0, + "reward": 1.8888888359069824, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 336.625, + "completions/mean_terminated_length": 336.625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.7129680870688065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2373046875, + "kl": 0.04837149614468217, + "learning_rate": 1.625681051143212e-05, + "loss": 0.0019, + "num_tokens": 32196856.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 295.875, + "completions/mean_terminated_length": 295.875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.7131525548791736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.03209849586710334, + "learning_rate": 1.625429846181039e-05, + "loss": 0.0013, + "num_tokens": 32205007.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 336.5, + "completions/mean_terminated_length": 336.5, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7133370226895407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.027820597169920802, + "learning_rate": 1.6251785763784586e-05, + "loss": 0.0011, + "num_tokens": 32213859.0, + "reward": 1.8958332538604736, + "reward_std": 0.294627845287323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 302.875, + "completions/mean_terminated_length": 302.875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.7135214904999078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04559844429604709, + "learning_rate": 1.6249272417615202e-05, + "loss": 0.0018, + "num_tokens": 32224154.0, + "reward": 1.954545497894287, + "reward_std": 0.03763990476727486, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9545454978942871, + "rewards/fixed_code_pass_all_test_reward/std": 0.037639934569597244, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 247.875, + "completions/mean_terminated_length": 247.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7137059583102748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.70703125, + "kl": 0.041830229689367115, + "learning_rate": 1.624675842356282e-05, + "loss": 0.0017, + "num_tokens": 32229641.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 344.125, + "completions/mean_terminated_length": 344.125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7138904261206419, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.04438215680420399, + "learning_rate": 1.6244243781888064e-05, + "loss": 0.0018, + "num_tokens": 32238914.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 162.0, + "completions/mean_terminated_length": 162.0, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.7140748939310091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.052020782488398254, + "learning_rate": 1.6241728492851637e-05, + "loss": 0.0021, + "num_tokens": 32243154.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 462.0, + "completions/mean_terminated_length": 462.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7142593617413762, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.023980124155059457, + "learning_rate": 1.623921255671431e-05, + "loss": 0.001, + "num_tokens": 32251754.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 210.5, + "completions/mean_terminated_length": 210.5, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7144438295517432, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.03125, + "kl": 0.050589021993801, + "learning_rate": 1.6236695973736916e-05, + "loss": 0.002, + "num_tokens": 32256542.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 323.625, + "completions/mean_terminated_length": 323.625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.7146282973621103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.047554214252159, + "learning_rate": 1.6234178744180357e-05, + "loss": 0.0019, + "num_tokens": 32262547.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 293.0, + "completions/mean_terminated_length": 293.0, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.7148127651724774, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.06492248130962253, + "learning_rate": 1.6231660868305603e-05, + "loss": 0.0026, + "num_tokens": 32268947.0, + "reward": 1.6306817531585693, + "reward_std": 0.11762481182813644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6306818127632141, + "rewards/fixed_code_pass_all_test_reward/std": 0.11762479692697525, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 169.625, + "completions/mean_terminated_length": 169.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.7149972329828445, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.0636580849532038, + "learning_rate": 1.6229142346373692e-05, + "loss": 0.0025, + "num_tokens": 32273272.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 315.125, + "completions/mean_terminated_length": 315.125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7151817007932116, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03331404330674559, + "learning_rate": 1.622662317864573e-05, + "loss": 0.0013, + "num_tokens": 32281985.0, + "reward": 1.9791666269302368, + "reward_std": 0.058925606310367584, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9791666269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.0589255727827549, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 477.125, + "completions/mean_terminated_length": 477.125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.7153661686035787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.04186164797283709, + "learning_rate": 1.622410336538288e-05, + "loss": 0.0017, + "num_tokens": 32296170.0, + "reward": 1.6782786846160889, + "reward_std": 0.4630914628505707, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8032786846160889, + "rewards/fixed_code_pass_all_test_reward/std": 0.3816539943218231, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 356.5, + "completions/mean_terminated_length": 356.5, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.7155506364139458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.03143377776723355, + "learning_rate": 1.6221582906846387e-05, + "loss": 0.0013, + "num_tokens": 32305294.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 453.75, + "completions/mean_terminated_length": 453.75, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.7157351042243129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.023263866896741092, + "learning_rate": 1.621906180329755e-05, + "loss": 0.0009, + "num_tokens": 32317716.0, + "reward": 1.7630208730697632, + "reward_std": 0.30536240339279175, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7630208730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.30536243319511414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 464.5, + "completions/mean_terminated_length": 464.5, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.7159195720346799, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.048990844399668276, + "learning_rate": 1.6216540054997743e-05, + "loss": 0.002, + "num_tokens": 32326720.0, + "reward": 1.8958334922790527, + "reward_std": 0.03303440287709236, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.033034369349479675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 367.125, + "completions/mean_terminated_length": 367.125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.716104039845047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.03421159158460796, + "learning_rate": 1.6214017662208407e-05, + "loss": 0.0014, + "num_tokens": 32334649.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 586.0, + "completions/mean_terminated_length": 586.0, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.7162885076554141, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.04030502657406032, + "learning_rate": 1.6211494625191043e-05, + "loss": 0.0016, + "num_tokens": 32345441.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.7164729754657813, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.06134243216365576, + "learning_rate": 1.620897094420722e-05, + "loss": 0.0025, + "num_tokens": 32353616.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 157.875, + "completions/mean_terminated_length": 157.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7166574432761483, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.453125, + "kl": 0.10858015157282352, + "learning_rate": 1.6206446619518587e-05, + "loss": 0.0043, + "num_tokens": 32357615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 351.625, + "completions/mean_terminated_length": 351.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7168419110865154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.04175499896518886, + "learning_rate": 1.6203921651386836e-05, + "loss": 0.0017, + "num_tokens": 32365492.0, + "reward": 1.46875, + "reward_std": 0.37796446681022644, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 301.75, + "completions/mean_terminated_length": 301.75, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.7170263788968825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.05503495829179883, + "learning_rate": 1.6201396040073745e-05, + "loss": 0.0022, + "num_tokens": 32373418.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 307.375, + "completions/mean_terminated_length": 307.375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.7172108467072495, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.06560879666358232, + "learning_rate": 1.6198869785841156e-05, + "loss": 0.0026, + "num_tokens": 32386053.0, + "reward": 1.6379311084747314, + "reward_std": 0.49970269203186035, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6379309892654419, + "rewards/fixed_code_pass_all_test_reward/std": 0.49970266222953796, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 337.875, + "completions/mean_terminated_length": 337.875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7173953145176166, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.04829346132464707, + "learning_rate": 1.6196342888950966e-05, + "loss": 0.0019, + "num_tokens": 32395788.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 370.375, + "completions/mean_terminated_length": 370.375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7175797823279838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.03954493743367493, + "learning_rate": 1.6193815349665157e-05, + "loss": 0.0016, + "num_tokens": 32406303.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7177642501383509, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05106371454894543, + "learning_rate": 1.6191287168245758e-05, + "loss": 0.002, + "num_tokens": 32412914.0, + "reward": 1.5250000953674316, + "reward_std": 0.2121320515871048, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5250000357627869, + "rewards/fixed_code_pass_all_test_reward/std": 0.2121320515871048, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.717948717948718, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.0182959494413808, + "learning_rate": 1.618875834495488e-05, + "loss": 0.0007, + "num_tokens": 32423601.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 261.125, + "completions/mean_terminated_length": 261.125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.718133185759085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.05798303382471204, + "learning_rate": 1.618622888005469e-05, + "loss": 0.0023, + "num_tokens": 32429874.0, + "reward": 1.4299449920654297, + "reward_std": 0.2857406735420227, + "rewards/fixed_code_pass_all_test_reward/mean": 0.42994505167007446, + "rewards/fixed_code_pass_all_test_reward/std": 0.2857407033443451, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 238.875, + "completions/mean_terminated_length": 238.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.7183176535694521, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.038142868084833026, + "learning_rate": 1.6183698773807434e-05, + "loss": 0.0015, + "num_tokens": 32439921.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 376.25, + "completions/mean_terminated_length": 376.25, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.7185021213798192, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058349609375, + "kl": 0.030330535140819848, + "learning_rate": 1.6181168026475407e-05, + "loss": 0.0012, + "num_tokens": 32451707.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.7186865891901864, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.06698972848244011, + "learning_rate": 1.617863663832099e-05, + "loss": 0.0027, + "num_tokens": 32456210.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.7188710570005534, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.03255838970653713, + "learning_rate": 1.617610460960661e-05, + "loss": 0.0013, + "num_tokens": 32463320.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 299.0, + "completions/mean_terminated_length": 299.0, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.7190555248109205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.0578594203107059, + "learning_rate": 1.6173571940594775e-05, + "loss": 0.0023, + "num_tokens": 32474048.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 365.375, + "completions/mean_terminated_length": 365.375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.7192399926212876, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.035128909978084266, + "learning_rate": 1.6171038631548056e-05, + "loss": 0.0014, + "num_tokens": 32481491.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 468.0, + "completions/mean_terminated_length": 468.0, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.7194244604316546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038330078125, + "kl": 0.018934093764983118, + "learning_rate": 1.6168504682729095e-05, + "loss": 0.0008, + "num_tokens": 32492251.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 328.375, + "completions/mean_terminated_length": 328.375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.7196089282420217, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.04249160597100854, + "learning_rate": 1.6165970094400584e-05, + "loss": 0.0017, + "num_tokens": 32501462.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 821.0, + "completions/max_terminated_length": 821.0, + "completions/mean_length": 742.5, + "completions/mean_terminated_length": 742.5, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "epoch": 0.7197933960523889, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.625, + "kl": 0.02753139554988593, + "learning_rate": 1.61634348668253e-05, + "loss": 0.0011, + "num_tokens": 32516578.0, + "reward": 1.78125, + "reward_std": 0.33905068039894104, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.33905068039894104, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 555.75, + "completions/mean_terminated_length": 555.75, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "epoch": 0.719977863862756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.02451376523822546, + "learning_rate": 1.616089900026608e-05, + "loss": 0.001, + "num_tokens": 32527312.0, + "reward": 1.9196429252624512, + "reward_std": 0.2272842973470688, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9196428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.22728432714939117, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 240.125, + "completions/mean_terminated_length": 240.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.720162331673123, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.03667365899309516, + "learning_rate": 1.6158362494985817e-05, + "loss": 0.0015, + "num_tokens": 32532265.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 332.0, + "completions/mean_terminated_length": 332.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.7203467994834901, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0389222779776901, + "learning_rate": 1.615582535124749e-05, + "loss": 0.0016, + "num_tokens": 32540553.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 340.875, + "completions/mean_terminated_length": 340.875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7205312672938572, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.050610557897016406, + "learning_rate": 1.615328756931412e-05, + "loss": 0.002, + "num_tokens": 32551256.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 408.0, + "completions/mean_terminated_length": 408.0, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.7207157351042243, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0291748046875, + "kl": 0.021787960431538522, + "learning_rate": 1.615074914944882e-05, + "loss": 0.0009, + "num_tokens": 32561872.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 476.125, + "completions/mean_terminated_length": 476.125, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.7209002029145914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0308837890625, + "kl": 0.025926578673534095, + "learning_rate": 1.6148210091914753e-05, + "loss": 0.001, + "num_tokens": 32571297.0, + "reward": 1.1666666269302368, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1666666716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 349.75, + "completions/mean_terminated_length": 349.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7210846707249585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.166015625, + "kl": 0.038321727653965354, + "learning_rate": 1.614567039697515e-05, + "loss": 0.0015, + "num_tokens": 32581879.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 272.5, + "completions/mean_terminated_length": 272.5, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.7212691385353256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.06289628148078918, + "learning_rate": 1.6143130064893305e-05, + "loss": 0.0025, + "num_tokens": 32588347.0, + "reward": 1.5208333730697632, + "reward_std": 0.4124789535999298, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.4124789535999298, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 407.0, + "completions/mean_terminated_length": 407.0, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7214536063456927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.099609375, + "kl": 0.041084932861849666, + "learning_rate": 1.614058909593259e-05, + "loss": 0.0016, + "num_tokens": 32600379.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 105.0, + "completions/mean_terminated_length": 105.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.7216380741560597, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.04542797524482012, + "learning_rate": 1.6138047490356432e-05, + "loss": 0.0018, + "num_tokens": 32603915.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7218225419664268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.030650387285277247, + "learning_rate": 1.6135505248428328e-05, + "loss": 0.0012, + "num_tokens": 32613829.0, + "reward": 1.9821429252624512, + "reward_std": 0.05050760135054588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 738.125, + "completions/mean_terminated_length": 738.125, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.722007009776794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.026643901015631855, + "learning_rate": 1.6132962370411847e-05, + "loss": 0.0011, + "num_tokens": 32628958.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 210.5, + "completions/mean_terminated_length": 210.5, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7221914775871611, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.796875, + "kl": 0.05347546795383096, + "learning_rate": 1.6130418856570606e-05, + "loss": 0.0021, + "num_tokens": 32637346.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.7223759453975281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.032074823742732406, + "learning_rate": 1.612787470716831e-05, + "loss": 0.0013, + "num_tokens": 32646412.0, + "reward": 1.915816307067871, + "reward_std": 0.23810741305351257, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9158163070678711, + "rewards/fixed_code_pass_all_test_reward/std": 0.23810741305351257, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 318.375, + "completions/mean_terminated_length": 318.375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.7225604132078952, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.056091839680448174, + "learning_rate": 1.6125329922468714e-05, + "loss": 0.0022, + "num_tokens": 32653431.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 528.625, + "completions/mean_terminated_length": 528.625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.7227448810182623, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.03367272554896772, + "learning_rate": 1.6122784502735647e-05, + "loss": 0.0013, + "num_tokens": 32663492.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 446.375, + "completions/mean_terminated_length": 217.57144165039062, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.7229293488286294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.046350834832992405, + "learning_rate": 1.6120238448232996e-05, + "loss": 0.0019, + "num_tokens": 32669847.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 209.5, + "completions/mean_terminated_length": 209.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.7231138166389965, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.08563888259232044, + "learning_rate": 1.6117691759224726e-05, + "loss": 0.0034, + "num_tokens": 32678531.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 275.875, + "completions/mean_terminated_length": 275.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.7232982844493636, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.04110580240376294, + "learning_rate": 1.611514443597486e-05, + "loss": 0.0016, + "num_tokens": 32685202.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 290.125, + "completions/mean_terminated_length": 290.125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.7234827522597307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.04443772812373936, + "learning_rate": 1.6112596478747482e-05, + "loss": 0.0018, + "num_tokens": 32691563.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 374.0, + "completions/mean_terminated_length": 374.0, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.7236672200700978, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.03798094438388944, + "learning_rate": 1.611004788780675e-05, + "loss": 0.0015, + "num_tokens": 32702947.0, + "reward": 1.8888888359069824, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 622.25, + "completions/mean_terminated_length": 622.25, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.7238516878804648, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0311279296875, + "kl": 0.02323835517745465, + "learning_rate": 1.6107498663416888e-05, + "loss": 0.0009, + "num_tokens": 32717909.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 334.875, + "completions/mean_terminated_length": 334.875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7240361556908319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.043910090113058686, + "learning_rate": 1.610494880584218e-05, + "loss": 0.0018, + "num_tokens": 32727604.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 177.125, + "completions/mean_terminated_length": 177.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.7242206235011991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.042537749744951725, + "learning_rate": 1.6102398315346976e-05, + "loss": 0.0017, + "num_tokens": 32732269.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7244050913115662, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.057756332447752357, + "learning_rate": 1.6099847192195696e-05, + "loss": 0.0023, + "num_tokens": 32738908.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 414.75, + "completions/mean_terminated_length": 414.75, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.7245895591219332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.05562214110977948, + "learning_rate": 1.6097295436652825e-05, + "loss": 0.0022, + "num_tokens": 32748842.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1220.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 505.25, + "completions/mean_terminated_length": 505.25, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7247740269323003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.014650968718342483, + "learning_rate": 1.6094743048982914e-05, + "loss": 0.0006, + "num_tokens": 32758188.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 311.375, + "completions/mean_terminated_length": 311.375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7249584947426674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.044432720518670976, + "learning_rate": 1.609219002945057e-05, + "loss": 0.0018, + "num_tokens": 32768159.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 391.25, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7251429625530345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.058671939419582486, + "learning_rate": 1.608963637832048e-05, + "loss": 0.0023, + "num_tokens": 32778881.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 316.5, + "completions/mean_terminated_length": 316.5, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.7253274303634016, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.044165000319480896, + "learning_rate": 1.608708209585739e-05, + "loss": 0.0018, + "num_tokens": 32787349.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 172.0, + "completions/mean_terminated_length": 172.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.7255118981737687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1484375, + "kl": 0.066458644811064, + "learning_rate": 1.60845271823261e-05, + "loss": 0.0027, + "num_tokens": 32791589.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 292.25, + "completions/mean_terminated_length": 292.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7256963659841358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1220703125, + "kl": 0.05199770093895495, + "learning_rate": 1.60819716379915e-05, + "loss": 0.0021, + "num_tokens": 32798135.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 193.625, + "completions/mean_terminated_length": 193.625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.7258808337945029, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.04387250286526978, + "learning_rate": 1.6079415463118525e-05, + "loss": 0.0018, + "num_tokens": 32802428.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 206.125, + "completions/mean_terminated_length": 206.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.7260653016048699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.376953125, + "kl": 0.04628057649824768, + "learning_rate": 1.607685865797218e-05, + "loss": 0.0019, + "num_tokens": 32806917.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 330.625, + "completions/mean_terminated_length": 330.625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.726249769415237, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.027005181298591197, + "learning_rate": 1.607430122281755e-05, + "loss": 0.0011, + "num_tokens": 32814898.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 273.25, + "completions/mean_terminated_length": 273.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.7264342372256042, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.65625, + "kl": 0.08047004439868033, + "learning_rate": 1.6071743157919757e-05, + "loss": 0.0032, + "num_tokens": 32824844.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 382.625, + "completions/mean_terminated_length": 382.625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7266187050359713, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.056424317648634315, + "learning_rate": 1.6069184463544013e-05, + "loss": 0.0023, + "num_tokens": 32835305.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 304.625, + "completions/mean_terminated_length": 304.625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7268031728463383, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.138671875, + "kl": 0.03463415149599314, + "learning_rate": 1.6066625139955584e-05, + "loss": 0.0014, + "num_tokens": 32845406.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 290.125, + "completions/mean_terminated_length": 290.125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.7269876406567054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.041150896809995174, + "learning_rate": 1.6064065187419807e-05, + "loss": 0.0016, + "num_tokens": 32854311.0, + "reward": 1.2666666507720947, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2666666805744171, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.7271721084670725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.04893456865102053, + "learning_rate": 1.6061504606202073e-05, + "loss": 0.002, + "num_tokens": 32862011.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 295.625, + "completions/mean_terminated_length": 295.625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7273565762774395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.026307484367862344, + "learning_rate": 1.6058943396567857e-05, + "loss": 0.0011, + "num_tokens": 32868080.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 177.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 149.875, + "completions/mean_terminated_length": 149.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7275410440878067, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.02838913816958666, + "learning_rate": 1.605638155878268e-05, + "loss": 0.0011, + "num_tokens": 32872239.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 201.5, + "completions/mean_terminated_length": 201.5, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.7277255118981738, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.173828125, + "kl": 0.08450439060106874, + "learning_rate": 1.605381909311214e-05, + "loss": 0.0034, + "num_tokens": 32876867.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 392.0, + "completions/mean_terminated_length": 392.0, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.7279099797085409, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98046875, + "kl": 0.029690427938476205, + "learning_rate": 1.6051255999821892e-05, + "loss": 0.0012, + "num_tokens": 32886107.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 383.125, + "completions/mean_terminated_length": 383.125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.728094447518908, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.03432905371300876, + "learning_rate": 1.6048692279177664e-05, + "loss": 0.0014, + "num_tokens": 32897332.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 244.0, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.728278915329275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.02945005102083087, + "learning_rate": 1.6046127931445245e-05, + "loss": 0.0012, + "num_tokens": 32902932.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 192.5, + "completions/mean_terminated_length": 192.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.7284633831396421, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.04274723259732127, + "learning_rate": 1.604356295689049e-05, + "loss": 0.0017, + "num_tokens": 32907480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 376.5, + "completions/mean_terminated_length": 376.5, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7286478509500092, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04971313290297985, + "learning_rate": 1.6040997355779316e-05, + "loss": 0.002, + "num_tokens": 32917500.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 556.5, + "completions/mean_terminated_length": 556.5, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.7288323187603764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.028119852184318006, + "learning_rate": 1.6038431128377713e-05, + "loss": 0.0011, + "num_tokens": 32931240.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 321.875, + "completions/mean_terminated_length": 321.875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7290167865707434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.02718433376867324, + "learning_rate": 1.6035864274951728e-05, + "loss": 0.0011, + "num_tokens": 32938223.0, + "reward": 1.7058823108673096, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.7292012543811105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.06950789829716086, + "learning_rate": 1.603329679576747e-05, + "loss": 0.0028, + "num_tokens": 32946657.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 234.375, + "completions/mean_terminated_length": 234.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.7293857221914776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03142592078074813, + "learning_rate": 1.6030728691091124e-05, + "loss": 0.0013, + "num_tokens": 32951876.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7295701900018446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.027054836857132614, + "learning_rate": 1.6028159961188934e-05, + "loss": 0.0011, + "num_tokens": 32958155.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 546.75, + "completions/mean_terminated_length": 546.75, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.7297546578122117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6953125, + "kl": 0.025944420252926648, + "learning_rate": 1.6025590606327208e-05, + "loss": 0.001, + "num_tokens": 32968433.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 506.5, + "completions/mean_terminated_length": 506.5, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.7299391256225789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.028234106488525867, + "learning_rate": 1.6023020626772313e-05, + "loss": 0.0011, + "num_tokens": 32981045.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 379.25, + "completions/mean_terminated_length": 379.25, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.730123593432946, + "frac_reward_zero_std": 1.0, + "grad_norm": 8.1875, + "kl": 0.245445808628574, + "learning_rate": 1.6020450022790695e-05, + "loss": 0.0098, + "num_tokens": 32993719.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 322.875, + "completions/mean_terminated_length": 322.875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.730308061243313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.053407154977321625, + "learning_rate": 1.6017878794648856e-05, + "loss": 0.0021, + "num_tokens": 33001502.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 146.75, + "completions/mean_terminated_length": 146.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.7304925290536801, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.0243281185394153, + "learning_rate": 1.6015306942613363e-05, + "loss": 0.001, + "num_tokens": 33005396.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 191.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.7306769968640472, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.09135144017636776, + "learning_rate": 1.6012734466950852e-05, + "loss": 0.0037, + "num_tokens": 33009673.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 363.625, + "completions/mean_terminated_length": 363.625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.7308614646744143, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054443359375, + "kl": 0.03955088322982192, + "learning_rate": 1.6010161367928017e-05, + "loss": 0.0016, + "num_tokens": 33016646.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 150.875, + "completions/mean_terminated_length": 150.875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.7310459324847814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.04728124290704727, + "learning_rate": 1.6007587645811614e-05, + "loss": 0.0019, + "num_tokens": 33020725.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 402.75, + "completions/mean_terminated_length": 402.75, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7312304002951485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2001953125, + "kl": 0.02095779002411291, + "learning_rate": 1.600501330086848e-05, + "loss": 0.0008, + "num_tokens": 33028123.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 286.5, + "completions/mean_terminated_length": 286.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7314148681055156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.09113500593230128, + "learning_rate": 1.60024383333655e-05, + "loss": 0.0036, + "num_tokens": 33036711.0, + "reward": 1.0714285373687744, + "reward_std": 0.07636039704084396, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0714285746216774, + "rewards/fixed_code_pass_all_test_reward/std": 0.07636035978794098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 208.375, + "completions/mean_terminated_length": 208.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.7315993359158827, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.05243783490732312, + "learning_rate": 1.5999862743569626e-05, + "loss": 0.0021, + "num_tokens": 33045330.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 151.625, + "completions/mean_terminated_length": 151.625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.7317838037262497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11181640625, + "kl": 0.045014011673629284, + "learning_rate": 1.599728653174789e-05, + "loss": 0.0018, + "num_tokens": 33049503.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7319682715366168, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.0528943941462785, + "learning_rate": 1.5994709698167363e-05, + "loss": 0.0021, + "num_tokens": 33058313.0, + "reward": 1.514423131942749, + "reward_std": 0.5204757452011108, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5144230723381042, + "rewards/fixed_code_pass_all_test_reward/std": 0.5204757452011108, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 294.25, + "completions/mean_terminated_length": 294.25, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.732152739346984, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05078125, + "kl": 0.040799153968691826, + "learning_rate": 1.5992132243095203e-05, + "loss": 0.0016, + "num_tokens": 33067803.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.7323372071573511, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.0640311362221837, + "learning_rate": 1.598955416679862e-05, + "loss": 0.0026, + "num_tokens": 33075818.0, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 493.25, + "completions/mean_terminated_length": 493.25, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.7325216749677181, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.03732651798054576, + "learning_rate": 1.598697546954489e-05, + "loss": 0.0015, + "num_tokens": 33090156.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.7327061427780852, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.027838158421218395, + "learning_rate": 1.598439615160136e-05, + "loss": 0.0011, + "num_tokens": 33103361.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 627.75, + "completions/mean_terminated_length": 627.75, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.7328906105884523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6484375, + "kl": 0.03472438082098961, + "learning_rate": 1.5981816213235432e-05, + "loss": 0.0014, + "num_tokens": 33123719.0, + "reward": 1.5201612710952759, + "reward_std": 0.3997313976287842, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5201612710952759, + "rewards/fixed_code_pass_all_test_reward/std": 0.39973142743110657, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 136.125, + "completions/mean_terminated_length": 136.125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.7330750783988194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.01801547675859183, + "learning_rate": 1.597923565471458e-05, + "loss": 0.0007, + "num_tokens": 33127512.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 509.625, + "completions/mean_terminated_length": 509.625, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.7332595462091865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.028727317927405238, + "learning_rate": 1.5976654476306338e-05, + "loss": 0.0011, + "num_tokens": 33137493.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.7334440140195536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.04967854171991348, + "learning_rate": 1.5974072678278306e-05, + "loss": 0.002, + "num_tokens": 33144358.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 551.5, + "completions/mean_terminated_length": 551.5, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.7336284818299207, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71484375, + "kl": 0.040022993460297585, + "learning_rate": 1.5971490260898142e-05, + "loss": 0.0016, + "num_tokens": 33158002.0, + "reward": 1.4953703880310059, + "reward_std": 0.6347249150276184, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6203703880310059, + "rewards/fixed_code_pass_all_test_reward/std": 0.31721773743629456, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 174.625, + "completions/mean_terminated_length": 174.625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.7338129496402878, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.388671875, + "kl": 0.06516158045269549, + "learning_rate": 1.5968907224433585e-05, + "loss": 0.0026, + "num_tokens": 33162151.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7339974174506548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0615234375, + "kl": 0.04679653234779835, + "learning_rate": 1.596632356915242e-05, + "loss": 0.0019, + "num_tokens": 33168908.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 402.5, + "completions/mean_terminated_length": 402.5, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.7341818852610219, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.035484156222082675, + "learning_rate": 1.5963739295322504e-05, + "loss": 0.0014, + "num_tokens": 33180096.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 315.75, + "completions/mean_terminated_length": 315.75, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.7343663530713891, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.640625, + "kl": 0.054780554957687855, + "learning_rate": 1.5961154403211755e-05, + "loss": 0.0022, + "num_tokens": 33188270.0, + "reward": 1.892045497894287, + "reward_std": 0.30534157156944275, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8920454382896423, + "rewards/fixed_code_pass_all_test_reward/std": 0.30534157156944275, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 311.25, + "completions/mean_terminated_length": 311.25, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7345508208817562, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06549424352124333, + "learning_rate": 1.595856889308816e-05, + "loss": 0.0026, + "num_tokens": 33194864.0, + "reward": 1.9431818723678589, + "reward_std": 0.047049880027770996, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9431818723678589, + "rewards/fixed_code_pass_all_test_reward/std": 0.047049909830093384, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 120.75, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.7347352886921232, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.07016842905431986, + "learning_rate": 1.5955982765219768e-05, + "loss": 0.0028, + "num_tokens": 33198566.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 362.0, + "completions/mean_terminated_length": 362.0, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.7349197565024903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.027329481905326247, + "learning_rate": 1.595339601987469e-05, + "loss": 0.0011, + "num_tokens": 33208670.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 301.375, + "completions/mean_terminated_length": 301.375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7351042243128574, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5, + "kl": 0.048960990039631724, + "learning_rate": 1.595080865732111e-05, + "loss": 0.002, + "num_tokens": 33217337.0, + "reward": 1.8799999952316284, + "reward_std": 0.24657657742500305, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8799999952316284, + "rewards/fixed_code_pass_all_test_reward/std": 0.24657656252384186, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7352886921232245, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.053261857479810715, + "learning_rate": 1.5948220677827253e-05, + "loss": 0.0021, + "num_tokens": 33223218.0, + "reward": 1.9736841917037964, + "reward_std": 0.048727408051490784, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9736841917037964, + "rewards/fixed_code_pass_all_test_reward/std": 0.048727381974458694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 157.0, + "completions/mean_terminated_length": 157.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.7354731599335916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5859375, + "kl": 0.09905620105564594, + "learning_rate": 1.5945632081661436e-05, + "loss": 0.004, + "num_tokens": 33227330.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 309.75, + "completions/mean_terminated_length": 309.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7356576277439587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.095703125, + "kl": 0.028455681866034865, + "learning_rate": 1.5943042869092024e-05, + "loss": 0.0011, + "num_tokens": 33234024.0, + "reward": 1.6881721019744873, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6881720423698425, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7358420955543258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.015687916544266045, + "learning_rate": 1.5940453040387448e-05, + "loss": 0.0006, + "num_tokens": 33239623.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.7360265633646929, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.027939641615375876, + "learning_rate": 1.593786259581621e-05, + "loss": 0.0011, + "num_tokens": 33243829.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 441.25, + "completions/mean_terminated_length": 441.25, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.7362110311750599, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.028994532534852624, + "learning_rate": 1.5935271535646858e-05, + "loss": 0.0012, + "num_tokens": 33250391.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 236.125, + "completions/mean_terminated_length": 236.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.736395498985427, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.041998119675554335, + "learning_rate": 1.593267986014803e-05, + "loss": 0.0017, + "num_tokens": 33255400.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 291.5, + "completions/mean_terminated_length": 291.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7365799667957942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047607421875, + "kl": 0.025974841555580497, + "learning_rate": 1.5930087569588403e-05, + "loss": 0.001, + "num_tokens": 33265276.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7367644346061613, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.13659921940416098, + "learning_rate": 1.5927494664236735e-05, + "loss": 0.0055, + "num_tokens": 33274598.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 435.375, + "completions/mean_terminated_length": 435.375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.7369489024165283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.050163556821644306, + "learning_rate": 1.592490114436184e-05, + "loss": 0.002, + "num_tokens": 33287065.0, + "reward": 1.9147727489471436, + "reward_std": 0.24105912446975708, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9147727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.24105913937091827, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 287.0, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.7371333702268954, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.0366561864502728, + "learning_rate": 1.5922307010232593e-05, + "loss": 0.0015, + "num_tokens": 33293201.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 432.625, + "completions/mean_terminated_length": 432.625, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.7373178380372625, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.018781858030706644, + "learning_rate": 1.591971226211794e-05, + "loss": 0.0008, + "num_tokens": 33301302.0, + "reward": 1.9318182468414307, + "reward_std": 0.042082689702510834, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9318182468414307, + "rewards/fixed_code_pass_all_test_reward/std": 0.04208271950483322, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 689.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 561.125, + "completions/mean_terminated_length": 561.125, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.7375023058476295, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.04009233810938895, + "learning_rate": 1.5917116900286885e-05, + "loss": 0.0016, + "num_tokens": 33311311.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 499.125, + "completions/mean_terminated_length": 499.125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.7376867736579967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.03048451093491167, + "learning_rate": 1.59145209250085e-05, + "loss": 0.0012, + "num_tokens": 33325704.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 436.625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.7378712414683638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04478382761590183, + "learning_rate": 1.5911924336551918e-05, + "loss": 0.0018, + "num_tokens": 33333549.0, + "reward": 1.795454502105713, + "reward_std": 0.37874454259872437, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7954545617103577, + "rewards/fixed_code_pass_all_test_reward/std": 0.37874457240104675, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 245.125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.7380557092787309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.03336609178222716, + "learning_rate": 1.5909327135186337e-05, + "loss": 0.0013, + "num_tokens": 33338958.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 255.0, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.738240177089098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.0220868110191077, + "learning_rate": 1.5906729321181017e-05, + "loss": 0.0009, + "num_tokens": 33343702.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 152.5, + "completions/mean_terminated_length": 152.5, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.738424644899465, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.11390826385468245, + "learning_rate": 1.5904130894805278e-05, + "loss": 0.0046, + "num_tokens": 33347754.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 438.125, + "completions/mean_terminated_length": 438.125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.7386091127098321, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8125, + "kl": 0.017839289736002684, + "learning_rate": 1.5901531856328512e-05, + "loss": 0.0007, + "num_tokens": 33359747.0, + "reward": 1.841346263885498, + "reward_std": 0.21896308660507202, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8413461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.2189631313085556, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 325.875, + "completions/mean_terminated_length": 325.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.7387935805201993, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09228515625, + "kl": 0.054490976966917515, + "learning_rate": 1.5898932206020173e-05, + "loss": 0.0022, + "num_tokens": 33366466.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 415.75, + "completions/mean_terminated_length": 415.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7389780483305664, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.06692877889145166, + "learning_rate": 1.5896331944149768e-05, + "loss": 0.0027, + "num_tokens": 33376192.0, + "reward": 1.4375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 239.625, + "completions/mean_terminated_length": 239.625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7391625161409334, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.4375, + "kl": 0.02966914139688015, + "learning_rate": 1.5893731070986875e-05, + "loss": 0.0012, + "num_tokens": 33382005.0, + "reward": 1.5384615659713745, + "reward_std": 0.4934053421020508, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.49340540170669556, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 743.625, + "completions/mean_terminated_length": 743.625, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.7393469839513005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.03560088959056884, + "learning_rate": 1.5891129586801143e-05, + "loss": 0.0014, + "num_tokens": 33396050.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 318.875, + "completions/mean_terminated_length": 318.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.7395314517616676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03857421875, + "kl": 0.0306621128693223, + "learning_rate": 1.588852749186227e-05, + "loss": 0.0012, + "num_tokens": 33401785.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 371.0, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.7397159195720346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.04600727395154536, + "learning_rate": 1.588592478644003e-05, + "loss": 0.0018, + "num_tokens": 33408785.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 499.875, + "completions/mean_terminated_length": 499.875, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.7399003873824018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.77734375, + "kl": 0.031372192897833884, + "learning_rate": 1.5883321470804248e-05, + "loss": 0.0013, + "num_tokens": 33420552.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 304.75, + "completions/mean_terminated_length": 304.75, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7400848551927689, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.0462660975754261, + "learning_rate": 1.5880717545224817e-05, + "loss": 0.0019, + "num_tokens": 33428598.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 313.625, + "completions/mean_terminated_length": 313.625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.740269323003136, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1689453125, + "kl": 0.051337195094674826, + "learning_rate": 1.5878113009971704e-05, + "loss": 0.0021, + "num_tokens": 33437051.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 300.875, + "completions/mean_terminated_length": 300.875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.740453790813503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0771484375, + "kl": 0.03495763521641493, + "learning_rate": 1.587550786531492e-05, + "loss": 0.0014, + "num_tokens": 33446138.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.0, + "completions/max_terminated_length": 997.0, + "completions/mean_length": 456.75, + "completions/mean_terminated_length": 456.75, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7406382586238701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.048813389614224434, + "learning_rate": 1.5872902111524556e-05, + "loss": 0.002, + "num_tokens": 33456632.0, + "reward": 1.7857142686843872, + "reward_std": 0.3295460343360901, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7857142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.3295460343360901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.7408227264342372, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.0294568482786417, + "learning_rate": 1.5870295748870756e-05, + "loss": 0.0012, + "num_tokens": 33461476.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 252.125, + "completions/mean_terminated_length": 252.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7410071942446043, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.05165248294360936, + "learning_rate": 1.5867688777623728e-05, + "loss": 0.0021, + "num_tokens": 33466741.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 362.125, + "completions/mean_terminated_length": 362.125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7411916620549714, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.04054518206976354, + "learning_rate": 1.586508119805375e-05, + "loss": 0.0016, + "num_tokens": 33473342.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 386.75, + "completions/mean_terminated_length": 386.75, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.7413761298653385, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.038770116632804275, + "learning_rate": 1.586247301043116e-05, + "loss": 0.0016, + "num_tokens": 33481156.0, + "reward": 1.4134615659713745, + "reward_std": 0.27254632115364075, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4134615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.27254632115364075, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 149.0, + "completions/mean_terminated_length": 149.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.7415605976757056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0908203125, + "kl": 0.03849170613102615, + "learning_rate": 1.585986421502635e-05, + "loss": 0.0015, + "num_tokens": 33485068.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 337.5, + "completions/mean_terminated_length": 337.5, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.7417450654860727, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.036965793929994106, + "learning_rate": 1.5857254812109788e-05, + "loss": 0.0015, + "num_tokens": 33495320.0, + "reward": 1.9821429252624512, + "reward_std": 0.05050760135054588, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 390.5, + "completions/mean_terminated_length": 390.5, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.7419295332964397, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042724609375, + "kl": 0.03305220138281584, + "learning_rate": 1.5854644801952003e-05, + "loss": 0.0013, + "num_tokens": 33505788.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 506.25, + "completions/mean_terminated_length": 506.25, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.7421140011068068, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.02479924855288118, + "learning_rate": 1.585203418482357e-05, + "loss": 0.001, + "num_tokens": 33515262.0, + "reward": 1.8909574747085571, + "reward_std": 0.044684480875730515, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8909574747085571, + "rewards/fixed_code_pass_all_test_reward/std": 0.04468446597456932, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 408.75, + "completions/mean_terminated_length": 408.75, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.742298468917174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.046957595739513636, + "learning_rate": 1.5849422960995157e-05, + "loss": 0.0019, + "num_tokens": 33523068.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 117.375, + "completions/mean_terminated_length": 117.375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.7424829367275411, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2578125, + "kl": 0.168346063233912, + "learning_rate": 1.584681113073747e-05, + "loss": 0.0067, + "num_tokens": 33526663.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 187.0, + "completions/mean_terminated_length": 187.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.7426674045379081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.07877808157354593, + "learning_rate": 1.5844198694321283e-05, + "loss": 0.0032, + "num_tokens": 33530983.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 478.0, + "completions/mean_terminated_length": 478.0, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7428518723482752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.56640625, + "kl": 0.023095287615433335, + "learning_rate": 1.5841585652017445e-05, + "loss": 0.0009, + "num_tokens": 33542303.0, + "reward": 1.9525315761566162, + "reward_std": 0.13426080346107483, + "rewards/fixed_code_pass_all_test_reward/mean": 0.952531635761261, + "rewards/fixed_code_pass_all_test_reward/std": 0.13426078855991364, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 370.0, + "completions/mean_terminated_length": 370.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.7430363401586423, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04638671875, + "kl": 0.026699120178818703, + "learning_rate": 1.583897200409685e-05, + "loss": 0.0011, + "num_tokens": 33553039.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 329.75, + "completions/mean_terminated_length": 329.75, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7432208079690094, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.025468763895332813, + "learning_rate": 1.5836357750830467e-05, + "loss": 0.001, + "num_tokens": 33562429.0, + "reward": 1.7472221851348877, + "reward_std": 0.10213766247034073, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7472221851348877, + "rewards/fixed_code_pass_all_test_reward/std": 0.10213764011859894, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 292.0, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7434052757793765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.03593021538108587, + "learning_rate": 1.5833742892489328e-05, + "loss": 0.0014, + "num_tokens": 33572309.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 368.625, + "completions/mean_terminated_length": 368.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7435897435897436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.03660622111056, + "learning_rate": 1.5831127429344516e-05, + "loss": 0.0015, + "num_tokens": 33580282.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 148.875, + "completions/mean_terminated_length": 148.875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.7437742114001107, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.02688920369837433, + "learning_rate": 1.5828511361667194e-05, + "loss": 0.0011, + "num_tokens": 33584409.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 132.0, + "completions/mean_terminated_length": 132.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7439586792104778, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.15625, + "kl": 0.06510383123531938, + "learning_rate": 1.5825894689728575e-05, + "loss": 0.0026, + "num_tokens": 33588129.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 202.625, + "completions/mean_terminated_length": 202.625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.7441431470208448, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.06580345064867288, + "learning_rate": 1.5823277413799937e-05, + "loss": 0.0026, + "num_tokens": 33594390.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 164.75, + "completions/mean_terminated_length": 164.75, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.7443276148312119, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3125, + "kl": 0.058121299371123314, + "learning_rate": 1.582065953415262e-05, + "loss": 0.0023, + "num_tokens": 33598492.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 124.375, + "completions/mean_terminated_length": 124.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.7445120826415791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.05001427954994142, + "learning_rate": 1.5818041051058034e-05, + "loss": 0.002, + "num_tokens": 33602343.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 298.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7446965504519462, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.032082814374007285, + "learning_rate": 1.581542196478764e-05, + "loss": 0.0013, + "num_tokens": 33608777.0, + "reward": 1.3499999046325684, + "reward_std": 0.1414213478565216, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3500000238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 384.0, + "completions/mean_terminated_length": 384.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7448810182623132, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.06394024123437703, + "learning_rate": 1.5812802275612972e-05, + "loss": 0.0026, + "num_tokens": 33616113.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 430.25, + "completions/mean_terminated_length": 430.25, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.7450654860726803, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.05865313624963164, + "learning_rate": 1.581018198380562e-05, + "loss": 0.0023, + "num_tokens": 33632035.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 456.75, + "completions/mean_terminated_length": 456.75, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.7452499538830474, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.019989887019619346, + "learning_rate": 1.5807561089637232e-05, + "loss": 0.0008, + "num_tokens": 33639705.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 250.125, + "completions/mean_terminated_length": 250.125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.7454344216934145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734375, + "kl": 0.027710677939467132, + "learning_rate": 1.5804939593379534e-05, + "loss": 0.0011, + "num_tokens": 33646186.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.7456188895037816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.06420320738106966, + "learning_rate": 1.5802317495304304e-05, + "loss": 0.0026, + "num_tokens": 33656594.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.7458033573141487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.05376526340842247, + "learning_rate": 1.579969479568338e-05, + "loss": 0.0022, + "num_tokens": 33660877.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 374.0, + "completions/mean_terminated_length": 374.0, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7459878251245158, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.05379949533380568, + "learning_rate": 1.579707149478867e-05, + "loss": 0.0022, + "num_tokens": 33671309.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 422.375, + "completions/mean_terminated_length": 422.375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.7461722929348829, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91796875, + "kl": 0.08552798861637712, + "learning_rate": 1.579444759289214e-05, + "loss": 0.0034, + "num_tokens": 33679888.0, + "reward": 1.6953125, + "reward_std": 0.3696606457233429, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3696606457233429, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 248.375, + "completions/mean_terminated_length": 248.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.7463567607452499, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1259765625, + "kl": 0.03387106506852433, + "learning_rate": 1.5791823090265817e-05, + "loss": 0.0014, + "num_tokens": 33684883.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 152.0, + "completions/mean_terminated_length": 152.0, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.746541228555617, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.028350495267659426, + "learning_rate": 1.5789197987181792e-05, + "loss": 0.0011, + "num_tokens": 33688867.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 286.125, + "completions/mean_terminated_length": 286.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7467256963659842, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.028165154508315027, + "learning_rate": 1.578657228391222e-05, + "loss": 0.0011, + "num_tokens": 33697628.0, + "reward": 1.65234375, + "reward_std": 0.29725655913352966, + "rewards/fixed_code_pass_all_test_reward/mean": 0.77734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.06823570281267166, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 458.25, + "completions/mean_terminated_length": 458.25, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.7469101641763513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.05670161754824221, + "learning_rate": 1.578394598072931e-05, + "loss": 0.0023, + "num_tokens": 33708694.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 318.625, + "completions/mean_terminated_length": 318.625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.7470946319867183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.04341883957386017, + "learning_rate": 1.5781319077905347e-05, + "loss": 0.0017, + "num_tokens": 33716899.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 483.375, + "completions/mean_terminated_length": 483.375, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.7472790997970854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9609375, + "kl": 0.033641395973972976, + "learning_rate": 1.577869157571267e-05, + "loss": 0.0013, + "num_tokens": 33728206.0, + "reward": 1.1875, + "reward_std": 0.3282996118068695, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3282995820045471, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 268.125, + "completions/mean_terminated_length": 268.125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7474635676074525, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.05429615546017885, + "learning_rate": 1.5776063474423677e-05, + "loss": 0.0022, + "num_tokens": 33736679.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 184.0, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.7476480354178195, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.0604194737970829, + "learning_rate": 1.5773434774310835e-05, + "loss": 0.0024, + "num_tokens": 33740919.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 487.75, + "completions/mean_terminated_length": 487.75, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7478325032281867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.03337793389800936, + "learning_rate": 1.577080547564667e-05, + "loss": 0.0013, + "num_tokens": 33749141.0, + "reward": 1.529761791229248, + "reward_std": 0.13810105621814728, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5297619104385376, + "rewards/fixed_code_pass_all_test_reward/std": 0.13810110092163086, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.7480169710385538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.06465550232678652, + "learning_rate": 1.5768175578703773e-05, + "loss": 0.0026, + "num_tokens": 33754750.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 377.625, + "completions/mean_terminated_length": 377.625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.7482014388489209, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.051934156101197004, + "learning_rate": 1.5765545083754784e-05, + "loss": 0.0021, + "num_tokens": 33766027.0, + "reward": 1.0925925970077515, + "reward_std": 0.058560676872730255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.09259258955717087, + "rewards/fixed_code_pass_all_test_reward/std": 0.058560699224472046, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 352.875, + "completions/mean_terminated_length": 352.875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.748385906659288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.04148395196534693, + "learning_rate": 1.576291399107243e-05, + "loss": 0.0017, + "num_tokens": 33775114.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 320.25, + "completions/mean_terminated_length": 320.25, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.748570374469655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.01985733606852591, + "learning_rate": 1.5760282300929474e-05, + "loss": 0.0008, + "num_tokens": 33781324.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.7487548422800221, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.84375, + "kl": 0.04690723423846066, + "learning_rate": 1.5757650013598755e-05, + "loss": 0.0019, + "num_tokens": 33786455.0, + "reward": 1.337499976158142, + "reward_std": 0.5316752195358276, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4625000059604645, + "rewards/fixed_code_pass_all_test_reward/std": 0.2875388264656067, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 566.0, + "completions/mean_terminated_length": 566.0, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.7489393100903893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.86328125, + "kl": 0.02554392023012042, + "learning_rate": 1.575501712935317e-05, + "loss": 0.001, + "num_tokens": 33796311.0, + "reward": 1.8208333253860474, + "reward_std": 0.09417565912008286, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8208333253860474, + "rewards/fixed_code_pass_all_test_reward/std": 0.09417562186717987, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 426.625, + "completions/mean_terminated_length": 426.625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.7491237779007563, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.03370983060449362, + "learning_rate": 1.5752383648465682e-05, + "loss": 0.0013, + "num_tokens": 33804868.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 555.25, + "completions/mean_terminated_length": 555.25, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.7493082457111234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671875, + "kl": 0.027929088100790977, + "learning_rate": 1.5749749571209313e-05, + "loss": 0.0011, + "num_tokens": 33814678.0, + "reward": 1.6688311100006104, + "reward_std": 0.19902664422988892, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6688311696052551, + "rewards/fixed_code_pass_all_test_reward/std": 0.19902662932872772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 276.5, + "completions/mean_terminated_length": 276.5, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7494927135214905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.032012793235480785, + "learning_rate": 1.5747114897857145e-05, + "loss": 0.0013, + "num_tokens": 33824770.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 576.125, + "completions/mean_terminated_length": 576.125, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.7496771813318576, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.031022879295051098, + "learning_rate": 1.5744479628682325e-05, + "loss": 0.0012, + "num_tokens": 33837291.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 214.25, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.7498616491422246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04541015625, + "kl": 0.01937434310093522, + "learning_rate": 1.5741843763958053e-05, + "loss": 0.0008, + "num_tokens": 33842021.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 287.375, + "completions/mean_terminated_length": 287.375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.7500461169525918, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.027074200217612088, + "learning_rate": 1.5739207303957606e-05, + "loss": 0.0011, + "num_tokens": 33848144.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 422.625, + "completions/mean_terminated_length": 422.625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.7502305847629589, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.04087367851752788, + "learning_rate": 1.5736570248954312e-05, + "loss": 0.0016, + "num_tokens": 33856221.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 344.875, + "completions/mean_terminated_length": 344.875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.750415052573326, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2578125, + "kl": 0.06644891854375601, + "learning_rate": 1.5733932599221566e-05, + "loss": 0.0027, + "num_tokens": 33867116.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.750599520383693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1455078125, + "kl": 0.023051332333125174, + "learning_rate": 1.5731294355032813e-05, + "loss": 0.0009, + "num_tokens": 33871462.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 294.875, + "completions/mean_terminated_length": 294.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.7507839881940601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.05594278662465513, + "learning_rate": 1.5728655516661584e-05, + "loss": 0.0022, + "num_tokens": 33880405.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 384.75, + "completions/mean_terminated_length": 384.75, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.7509684560044272, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.02845495706424117, + "learning_rate": 1.5726016084381438e-05, + "loss": 0.0011, + "num_tokens": 33891835.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 823.5, + "completions/mean_terminated_length": 823.5, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "epoch": 0.7511529238147944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.043188205221667886, + "learning_rate": 1.572337605846603e-05, + "loss": 0.0017, + "num_tokens": 33910263.0, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 330.75, + "completions/mean_terminated_length": 330.75, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7513373916251614, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.08610758441500366, + "learning_rate": 1.572073543918905e-05, + "loss": 0.0034, + "num_tokens": 33920469.0, + "reward": 1.221153974533081, + "reward_std": 0.02719642035663128, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2211538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 346.0, + "completions/mean_terminated_length": 346.0, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.7515218594355285, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11328125, + "kl": 0.07740178680978715, + "learning_rate": 1.5718094226824264e-05, + "loss": 0.0031, + "num_tokens": 33930485.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 258.375, + "completions/mean_terminated_length": 258.375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.7517063272458956, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.07915871846489608, + "learning_rate": 1.5715452421645494e-05, + "loss": 0.0032, + "num_tokens": 33935968.0, + "reward": 1.9500000476837158, + "reward_std": 0.09258202463388443, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.09258200973272324, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 179.25, + "completions/mean_terminated_length": 179.25, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.7518907950562627, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.057063727639615536, + "learning_rate": 1.5712810023926628e-05, + "loss": 0.0023, + "num_tokens": 33940202.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 385.75, + "completions/mean_terminated_length": 385.75, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7520752628666297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.03737764130346477, + "learning_rate": 1.5710167033941607e-05, + "loss": 0.0015, + "num_tokens": 33947864.0, + "reward": 1.8430231809616089, + "reward_std": 0.12661686539649963, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8430233001708984, + "rewards/fixed_code_pass_all_test_reward/std": 0.12661688029766083, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1173.0, + "completions/max_terminated_length": 1173.0, + "completions/mean_length": 405.875, + "completions/mean_terminated_length": 405.875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7522597306769969, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.06536904047243297, + "learning_rate": 1.5707523451964442e-05, + "loss": 0.0026, + "num_tokens": 33959583.0, + "reward": 1.3641304969787598, + "reward_std": 0.6139689683914185, + "rewards/fixed_code_pass_all_test_reward/mean": 0.489130437374115, + "rewards/fixed_code_pass_all_test_reward/std": 0.3349721133708954, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 557.875, + "completions/mean_terminated_length": 557.875, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.752444198487364, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.03108978015370667, + "learning_rate": 1.5704879278269197e-05, + "loss": 0.0012, + "num_tokens": 33969558.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 462.5, + "completions/mean_terminated_length": 462.5, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.7526286662977311, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.028140490758232772, + "learning_rate": 1.570223451313001e-05, + "loss": 0.0011, + "num_tokens": 33982458.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 288.625, + "completions/mean_terminated_length": 288.625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7528131341080981, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.0488034060690552, + "learning_rate": 1.5699589156821074e-05, + "loss": 0.002, + "num_tokens": 33989015.0, + "reward": 1.5957446098327637, + "reward_std": 0.23637956380844116, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5957446694374084, + "rewards/fixed_code_pass_all_test_reward/std": 0.23637959361076355, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 331.625, + "completions/mean_terminated_length": 331.625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7529976019184652, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.04167483979836106, + "learning_rate": 1.5696943209616632e-05, + "loss": 0.0017, + "num_tokens": 34000532.0, + "reward": 1.9950000047683716, + "reward_std": 0.014142122119665146, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9950000047683716, + "rewards/fixed_code_pass_all_test_reward/std": 0.014142143540084362, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 619.25, + "completions/mean_terminated_length": 619.25, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.7531820697288323, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.03904522303491831, + "learning_rate": 1.5694296671791004e-05, + "loss": 0.0016, + "num_tokens": 34012558.0, + "reward": 1.89453125, + "reward_std": 0.29831066727638245, + "rewards/fixed_code_pass_all_test_reward/mean": 0.89453125, + "rewards/fixed_code_pass_all_test_reward/std": 0.29831069707870483, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 314.125, + "completions/mean_terminated_length": 314.125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.7533665375391994, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.02607435453683138, + "learning_rate": 1.5691649543618564e-05, + "loss": 0.001, + "num_tokens": 34018855.0, + "reward": 1.8229167461395264, + "reward_std": 0.1293872892856598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 449.375, + "completions/mean_terminated_length": 449.375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7535510053495665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.734375, + "kl": 0.03199944703374058, + "learning_rate": 1.568900182537375e-05, + "loss": 0.0013, + "num_tokens": 34031522.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 349.5, + "completions/mean_terminated_length": 349.5, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7537354731599336, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8671875, + "kl": 0.0555715961381793, + "learning_rate": 1.5686353517331062e-05, + "loss": 0.0022, + "num_tokens": 34042726.0, + "reward": 1.65625, + "reward_std": 0.47442004084587097, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.47442010045051575, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 329.25, + "completions/mean_terminated_length": 329.25, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.7539199409703007, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.04379092110320926, + "learning_rate": 1.568370461976505e-05, + "loss": 0.0018, + "num_tokens": 34053104.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 353.625, + "completions/mean_terminated_length": 353.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7541044087806678, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.027351115364581347, + "learning_rate": 1.5681055132950347e-05, + "loss": 0.0011, + "num_tokens": 34062485.0, + "reward": 1.7272727489471436, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7272727489471436, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 284.125, + "completions/mean_terminated_length": 284.125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7542888765910348, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0478515625, + "kl": 0.03654542029835284, + "learning_rate": 1.5678405057161625e-05, + "loss": 0.0015, + "num_tokens": 34076454.0, + "reward": 1.0952380895614624, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.095238097012043, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 395.625, + "completions/mean_terminated_length": 395.625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.7544733444014019, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05040164804086089, + "learning_rate": 1.5675754392673626e-05, + "loss": 0.002, + "num_tokens": 34084083.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 530.5, + "completions/mean_terminated_length": 530.5, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.7546578122117691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.0252474487060681, + "learning_rate": 1.567310313976116e-05, + "loss": 0.001, + "num_tokens": 34094151.0, + "reward": 1.2083333730697632, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333283662796, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 240.125, + "completions/mean_terminated_length": 240.125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7548422800221362, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6171875, + "kl": 0.0407392093911767, + "learning_rate": 1.5670451298699085e-05, + "loss": 0.0016, + "num_tokens": 34099904.0, + "reward": 1.600961446762085, + "reward_std": 0.40544557571411133, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6009615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.40544557571411133, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 448.25, + "completions/mean_terminated_length": 448.25, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.7550267478325032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.032078812131658196, + "learning_rate": 1.5667798869762328e-05, + "loss": 0.0013, + "num_tokens": 34111778.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 420.625, + "completions/mean_terminated_length": 420.625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.7552112156428703, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031494140625, + "kl": 0.021950013004243374, + "learning_rate": 1.5665145853225876e-05, + "loss": 0.0009, + "num_tokens": 34121383.0, + "reward": 1.6666667461395264, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 366.75, + "completions/mean_terminated_length": 366.75, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7553956834532374, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.051481425296515226, + "learning_rate": 1.5662492249364778e-05, + "loss": 0.0021, + "num_tokens": 34130477.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 521.0, + "completions/mean_terminated_length": 521.0, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.7555801512636044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.02077665668912232, + "learning_rate": 1.5659838058454136e-05, + "loss": 0.0008, + "num_tokens": 34142309.0, + "reward": 1.633802890777588, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6338028311729431, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 525.875, + "completions/mean_terminated_length": 525.875, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.7557646190739716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.03582437802106142, + "learning_rate": 1.5657183280769128e-05, + "loss": 0.0014, + "num_tokens": 34155484.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 112.0, + "completions/mean_terminated_length": 112.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.7559490868843387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.02201786037767306, + "learning_rate": 1.5654527916584972e-05, + "loss": 0.0009, + "num_tokens": 34159244.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 342.75, + "completions/mean_terminated_length": 342.75, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.7561335546947058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.03945993003435433, + "learning_rate": 1.565187196617697e-05, + "loss": 0.0016, + "num_tokens": 34166394.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 429.375, + "completions/mean_terminated_length": 429.375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.7563180225050729, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.03478241455741227, + "learning_rate": 1.5649215429820466e-05, + "loss": 0.0014, + "num_tokens": 34174605.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 246.625, + "completions/mean_terminated_length": 246.625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.7565024903154399, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.08544080378487706, + "learning_rate": 1.5646558307790877e-05, + "loss": 0.0034, + "num_tokens": 34180298.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 545.5, + "completions/mean_terminated_length": 545.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.756686958125807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.04805665370076895, + "learning_rate": 1.5643900600363666e-05, + "loss": 0.0019, + "num_tokens": 34189270.0, + "reward": 1.4375, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 367.5, + "completions/mean_terminated_length": 367.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7568714259361742, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.0419107002671808, + "learning_rate": 1.5641242307814382e-05, + "loss": 0.0017, + "num_tokens": 34200802.0, + "reward": 1.7727272510528564, + "reward_std": 0.3401506841182709, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7727272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.3401506841182709, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 257.25, + "completions/mean_terminated_length": 257.25, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7570558937465413, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.06570312357507646, + "learning_rate": 1.5638583430418603e-05, + "loss": 0.0026, + "num_tokens": 34210828.0, + "reward": 1.1964285373687744, + "reward_std": 0.3813242018222809, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1964285671710968, + "rewards/fixed_code_pass_all_test_reward/std": 0.38132426142692566, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7572403615569083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.041695571737363935, + "learning_rate": 1.5635923968451996e-05, + "loss": 0.0017, + "num_tokens": 34220394.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 322.75, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7574248293672754, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10888671875, + "kl": 0.04562750272452831, + "learning_rate": 1.5633263922190266e-05, + "loss": 0.0018, + "num_tokens": 34229480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 397.5, + "completions/mean_terminated_length": 397.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7576092971776425, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03588966163806617, + "learning_rate": 1.56306032919092e-05, + "loss": 0.0014, + "num_tokens": 34240516.0, + "reward": 1.6477272510528564, + "reward_std": 0.32979726791381836, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6477272510528564, + "rewards/fixed_code_pass_all_test_reward/std": 0.32979726791381836, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 344.5, + "completions/mean_terminated_length": 344.5, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.7577937649880095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.037955476436764, + "learning_rate": 1.5627942077884627e-05, + "loss": 0.0015, + "num_tokens": 34247368.0, + "reward": 1.1538461446762085, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1538461595773697, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 285.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.7579782327983767, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.04663580749183893, + "learning_rate": 1.562528028039245e-05, + "loss": 0.0019, + "num_tokens": 34255789.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 343.625, + "completions/mean_terminated_length": 343.625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.7581627006087438, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.04126089462079108, + "learning_rate": 1.5622617899708613e-05, + "loss": 0.0017, + "num_tokens": 34262914.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 258.875, + "completions/mean_terminated_length": 258.875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7583471684191109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.287109375, + "kl": 0.05121637345291674, + "learning_rate": 1.5619954936109148e-05, + "loss": 0.002, + "num_tokens": 34271169.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 443.125, + "completions/mean_terminated_length": 443.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.758531636229478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.041525262175127864, + "learning_rate": 1.561729138987013e-05, + "loss": 0.0017, + "num_tokens": 34282634.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.758716104039845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.05537239124532789, + "learning_rate": 1.5614627261267697e-05, + "loss": 0.0022, + "num_tokens": 34291928.0, + "reward": 1.8790322542190552, + "reward_std": 0.34214845299720764, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8790322542190552, + "rewards/fixed_code_pass_all_test_reward/std": 0.34214845299720764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 312.125, + "completions/mean_terminated_length": 312.125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.7589005718502121, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.038832023506984115, + "learning_rate": 1.5611962550578045e-05, + "loss": 0.0016, + "num_tokens": 34301137.0, + "reward": 1.943750023841858, + "reward_std": 0.1590990275144577, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9437500238418579, + "rewards/fixed_code_pass_all_test_reward/std": 0.1590990275144577, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 360.75, + "completions/mean_terminated_length": 360.75, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7590850396605793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.03992340853437781, + "learning_rate": 1.5609297258077436e-05, + "loss": 0.0016, + "num_tokens": 34310791.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7592695074709463, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.038001936743967235, + "learning_rate": 1.560663138404219e-05, + "loss": 0.0015, + "num_tokens": 34316603.0, + "reward": 1.9605263471603394, + "reward_std": 0.11164842545986176, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9605263471603394, + "rewards/fixed_code_pass_all_test_reward/std": 0.11164844036102295, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 325.25, + "completions/mean_terminated_length": 325.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7594539752813134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.02253741107415408, + "learning_rate": 1.5603964928748685e-05, + "loss": 0.0009, + "num_tokens": 34322909.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 421.5, + "completions/mean_terminated_length": 421.5, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.7596384430916805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.018645445816218853, + "learning_rate": 1.560129789247337e-05, + "loss": 0.0007, + "num_tokens": 34330889.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7598229109020476, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32421875, + "kl": 0.06693549687042832, + "learning_rate": 1.559863027549273e-05, + "loss": 0.0027, + "num_tokens": 34339862.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 285.25, + "completions/mean_terminated_length": 285.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.7600073787124146, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03041716292500496, + "learning_rate": 1.559596207808334e-05, + "loss": 0.0012, + "num_tokens": 34348384.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 365.25, + "completions/mean_terminated_length": 365.25, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.7601918465227818, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.04703114437870681, + "learning_rate": 1.5593293300521814e-05, + "loss": 0.0019, + "num_tokens": 34361658.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 320.75, + "completions/mean_terminated_length": 320.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.7603763143331489, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.03171178151387721, + "learning_rate": 1.5590623943084836e-05, + "loss": 0.0013, + "num_tokens": 34368736.0, + "reward": 1.6617646217346191, + "reward_std": 0.20876337587833405, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6617647409439087, + "rewards/fixed_code_pass_all_test_reward/std": 0.20876334607601166, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 358.625, + "completions/mean_terminated_length": 358.625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.760560782143516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.04495143215171993, + "learning_rate": 1.5587954006049145e-05, + "loss": 0.0018, + "num_tokens": 34380285.0, + "reward": 1.670454502105713, + "reward_std": 0.052952997386455536, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6704545617103577, + "rewards/fixed_code_pass_all_test_reward/std": 0.05295296385884285, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 400.25, + "completions/mean_terminated_length": 400.25, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.760745249953883, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.02235506847500801, + "learning_rate": 1.5585283489691544e-05, + "loss": 0.0009, + "num_tokens": 34388119.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 333.0, + "completions/mean_terminated_length": 333.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7609297177642501, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.04645890276879072, + "learning_rate": 1.5582612394288892e-05, + "loss": 0.0019, + "num_tokens": 34398559.0, + "reward": 1.9272727966308594, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9272727370262146, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 228.5, + "completions/mean_terminated_length": 228.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.7611141855746172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.03720030828844756, + "learning_rate": 1.557994072011811e-05, + "loss": 0.0015, + "num_tokens": 34410123.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 321.375, + "completions/mean_terminated_length": 321.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7612986533849844, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11962890625, + "kl": 0.022905010846443474, + "learning_rate": 1.5577268467456182e-05, + "loss": 0.0009, + "num_tokens": 34416998.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 392.875, + "completions/mean_terminated_length": 392.875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.7614831211953514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83203125, + "kl": 0.03299618302844465, + "learning_rate": 1.5574595636580156e-05, + "loss": 0.0013, + "num_tokens": 34424901.0, + "reward": 1.454545497894287, + "reward_std": 0.09718587249517441, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, + "rewards/fixed_code_pass_all_test_reward/std": 0.0971859022974968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 508.125, + "completions/mean_terminated_length": 508.125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.7616675890057185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.04822696594055742, + "learning_rate": 1.5571922227767118e-05, + "loss": 0.0019, + "num_tokens": 34434334.0, + "reward": 1.0833332538604736, + "reward_std": 0.0690065547823906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.06900656223297119, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.7618520568160856, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.055022924207150936, + "learning_rate": 1.556924824129424e-05, + "loss": 0.0022, + "num_tokens": 34439580.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 398.875, + "completions/mean_terminated_length": 398.875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.7620365246264527, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.058222068939357996, + "learning_rate": 1.556657367743874e-05, + "loss": 0.0023, + "num_tokens": 34453099.0, + "reward": 1.5431034564971924, + "reward_std": 0.48844295740127563, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5431034564971924, + "rewards/fixed_code_pass_all_test_reward/std": 0.48844295740127563, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 422.625, + "completions/mean_terminated_length": 422.625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.7622209924368197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.022955544875003397, + "learning_rate": 1.5563898536477902e-05, + "loss": 0.0009, + "num_tokens": 34465096.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 311.125, + "completions/mean_terminated_length": 311.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.7624054602471869, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.04030996223445982, + "learning_rate": 1.556122281868906e-05, + "loss": 0.0016, + "num_tokens": 34473361.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 289.25, + "completions/mean_terminated_length": 289.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.762589928057554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90234375, + "kl": 0.03349179634824395, + "learning_rate": 1.5558546524349623e-05, + "loss": 0.0013, + "num_tokens": 34481443.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 288.875, + "completions/mean_terminated_length": 288.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7627743958679211, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8515625, + "kl": 0.058142867404967546, + "learning_rate": 1.5555869653737044e-05, + "loss": 0.0023, + "num_tokens": 34487594.0, + "reward": 1.78125, + "reward_std": 0.405046284198761, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.405046284198761, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 675.875, + "completions/mean_terminated_length": 675.875, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "epoch": 0.7629588636782881, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.640625, + "kl": 0.022991442354395986, + "learning_rate": 1.555319220712885e-05, + "loss": 0.0009, + "num_tokens": 34506161.0, + "reward": 1.6145833730697632, + "reward_std": 0.5097025036811829, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6145833134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.5097025036811829, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 783.125, + "completions/mean_terminated_length": 783.125, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.7631433314886552, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7109375, + "kl": 0.04164518998004496, + "learning_rate": 1.5550514184802614e-05, + "loss": 0.0017, + "num_tokens": 34522898.0, + "reward": 1.4916666746139526, + "reward_std": 0.7497618794441223, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6166666746139526, + "rewards/fixed_code_pass_all_test_reward/std": 0.5108349919319153, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 389.5, + "completions/mean_terminated_length": 389.5, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.7633277992990223, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.05158032616600394, + "learning_rate": 1.554783558703598e-05, + "loss": 0.0021, + "num_tokens": 34536174.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 260.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.7635122671093895, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.0462767337448895, + "learning_rate": 1.5545156414106647e-05, + "loss": 0.0019, + "num_tokens": 34544239.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 346.875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7636967349197565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.83984375, + "kl": 0.03616289282217622, + "learning_rate": 1.554247666629237e-05, + "loss": 0.0014, + "num_tokens": 34553622.0, + "reward": 1.6477272510528564, + "reward_std": 0.22498849034309387, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6477273106575012, + "rewards/fixed_code_pass_all_test_reward/std": 0.22498854994773865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 438.0, + "completions/mean_terminated_length": 438.0, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.7638812027301236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.041621406679041684, + "learning_rate": 1.553979634387097e-05, + "loss": 0.0017, + "num_tokens": 34562422.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 201.125, + "completions/mean_terminated_length": 201.125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.7640656705404907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212890625, + "kl": 0.04784901230596006, + "learning_rate": 1.5537115447120333e-05, + "loss": 0.0019, + "num_tokens": 34566783.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 308.375, + "completions/mean_terminated_length": 308.375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7642501383508578, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.04853527818340808, + "learning_rate": 1.5534433976318383e-05, + "loss": 0.0019, + "num_tokens": 34573394.0, + "reward": 1.796875, + "reward_std": 0.3892385959625244, + "rewards/fixed_code_pass_all_test_reward/mean": 0.796875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3892386257648468, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 216.0, + "completions/mean_terminated_length": 216.0, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.7644346061612248, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.037617075140587986, + "learning_rate": 1.5531751931743122e-05, + "loss": 0.0015, + "num_tokens": 34583146.0, + "reward": 1.5, + "reward_std": 0.076360322535038, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.07636036723852158, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 218.125, + "completions/mean_terminated_length": 218.125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.764619073971592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.03491357038728893, + "learning_rate": 1.552906931367261e-05, + "loss": 0.0014, + "num_tokens": 34592347.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 340.5, + "completions/mean_terminated_length": 340.5, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7648035417819591, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.02737523231189698, + "learning_rate": 1.552638612238496e-05, + "loss": 0.0011, + "num_tokens": 34602815.0, + "reward": 1.50632905960083, + "reward_std": 0.41851532459259033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6313291192054749, + "rewards/fixed_code_pass_all_test_reward/std": 0.3943217396736145, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 419.125, + "completions/mean_terminated_length": 419.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.7649880095923262, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.03803575085476041, + "learning_rate": 1.5523702358158348e-05, + "loss": 0.0015, + "num_tokens": 34610928.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.7651724774026932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.04329600534401834, + "learning_rate": 1.552101802127101e-05, + "loss": 0.0017, + "num_tokens": 34615358.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 242.125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7653569452130603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32421875, + "kl": 0.054386631469242275, + "learning_rate": 1.5518333112001232e-05, + "loss": 0.0022, + "num_tokens": 34621119.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 1209.0, + "completions/mean_length": 767.75, + "completions/mean_terminated_length": 767.75, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.7655414130234274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.034190513426437974, + "learning_rate": 1.5515647630627382e-05, + "loss": 0.0014, + "num_tokens": 34633893.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 297.125, + "completions/mean_terminated_length": 297.125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7657258808337944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.05819693300873041, + "learning_rate": 1.5512961577427865e-05, + "loss": 0.0023, + "num_tokens": 34641790.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 338.625, + "completions/mean_terminated_length": 338.625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7659103486441616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.024057555594481528, + "learning_rate": 1.551027495268115e-05, + "loss": 0.001, + "num_tokens": 34652523.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 271.5, + "completions/mean_terminated_length": 271.5, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7660948164545287, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.06564589589834213, + "learning_rate": 1.5507587756665775e-05, + "loss": 0.0026, + "num_tokens": 34660207.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 342.875, + "completions/mean_terminated_length": 342.875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.7662792842648958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.040337187703698874, + "learning_rate": 1.5504899989660324e-05, + "loss": 0.0016, + "num_tokens": 34667582.0, + "reward": 1.6538461446762085, + "reward_std": 0.483016699552536, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6538461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.4830167293548584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 257.375, + "completions/mean_terminated_length": 257.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7664637520752628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.052766723558306694, + "learning_rate": 1.5502211651943454e-05, + "loss": 0.0021, + "num_tokens": 34676033.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 310.5, + "completions/mean_terminated_length": 310.5, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7666482198856299, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.04932376532815397, + "learning_rate": 1.5499522743793868e-05, + "loss": 0.002, + "num_tokens": 34685117.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.766832687695997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.04520510137081146, + "learning_rate": 1.5496833265490335e-05, + "loss": 0.0018, + "num_tokens": 34695044.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 329.5, + "completions/mean_terminated_length": 329.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7670171555063642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.0413076535332948, + "learning_rate": 1.5494143217311685e-05, + "loss": 0.0017, + "num_tokens": 34705952.0, + "reward": 1.9038461446762085, + "reward_std": 0.2719641625881195, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 264.625, + "completions/mean_terminated_length": 264.625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.7672016233167313, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.05754320742562413, + "learning_rate": 1.5491452599536806e-05, + "loss": 0.0023, + "num_tokens": 34715981.0, + "reward": 1.5336538553237915, + "reward_std": 0.5020825266838074, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5336538553237915, + "rewards/fixed_code_pass_all_test_reward/std": 0.5020825266838074, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 456.625, + "completions/mean_terminated_length": 456.625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.7673860911270983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.92578125, + "kl": 0.025910448050126433, + "learning_rate": 1.5488761412444638e-05, + "loss": 0.001, + "num_tokens": 34723954.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 175.375, + "completions/mean_terminated_length": 175.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.7675705589374654, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0625, + "kl": 0.040328719245735556, + "learning_rate": 1.5486069656314185e-05, + "loss": 0.0016, + "num_tokens": 34728341.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 426.0, + "completions/mean_terminated_length": 426.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7677550267478325, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0234375, + "kl": 0.03692259616218507, + "learning_rate": 1.5483377331424516e-05, + "loss": 0.0015, + "num_tokens": 34737645.0, + "reward": 1.2397959232330322, + "reward_std": 0.025582974776625633, + "rewards/fixed_code_pass_all_test_reward/mean": 0.23979592323303223, + "rewards/fixed_code_pass_all_test_reward/std": 0.025582989677786827, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 255.625, + "completions/mean_terminated_length": 255.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7679394945581995, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.03559138206765056, + "learning_rate": 1.548068443805475e-05, + "loss": 0.0014, + "num_tokens": 34747778.0, + "reward": 1.7699275016784668, + "reward_std": 0.3413332402706146, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7699275016784668, + "rewards/fixed_code_pass_all_test_reward/std": 0.341333270072937, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 117.0, + "completions/max_terminated_length": 117.0, + "completions/mean_length": 81.0, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.7681239623685667, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.84375, + "kl": 0.0358170090476051, + "learning_rate": 1.5477990976484067e-05, + "loss": 0.0014, + "num_tokens": 34751106.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 268.375, + "completions/mean_terminated_length": 268.375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.7683084301789338, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.042712693102657795, + "learning_rate": 1.5475296946991712e-05, + "loss": 0.0017, + "num_tokens": 34757085.0, + "reward": 1.4310344457626343, + "reward_std": 0.04876599833369255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.43103450536727905, + "rewards/fixed_code_pass_all_test_reward/std": 0.04876597970724106, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 173.125, + "completions/mean_terminated_length": 173.125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.7684928979893009, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.09636898338794708, + "learning_rate": 1.5472602349856978e-05, + "loss": 0.0039, + "num_tokens": 34761382.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 287.125, + "completions/mean_terminated_length": 287.125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7686773657996679, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.028883957187645137, + "learning_rate": 1.5469907185359227e-05, + "loss": 0.0012, + "num_tokens": 34767767.0, + "reward": 1.9444444179534912, + "reward_std": 0.15713484585285187, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9444444179534912, + "rewards/fixed_code_pass_all_test_reward/std": 0.15713483095169067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 379.125, + "completions/mean_terminated_length": 379.125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.768861833610035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.03723579877987504, + "learning_rate": 1.5467211453777874e-05, + "loss": 0.0015, + "num_tokens": 34777496.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 399.75, + "completions/mean_terminated_length": 399.75, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.7690463014204021, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.953125, + "kl": 0.04571031464729458, + "learning_rate": 1.5464515155392397e-05, + "loss": 0.0018, + "num_tokens": 34786038.0, + "reward": 1.0071427822113037, + "reward_std": 0.5345770120620728, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13214287161827087, + "rewards/fixed_code_pass_all_test_reward/std": 0.3507384657859802, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 198.5, + "completions/mean_terminated_length": 198.5, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.7692307692307693, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7265625, + "kl": 0.03686756337992847, + "learning_rate": 1.5461818290482323e-05, + "loss": 0.0015, + "num_tokens": 34790578.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 302.25, + "completions/mean_terminated_length": 302.25, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7694152370411363, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.04737623827531934, + "learning_rate": 1.5459120859327255e-05, + "loss": 0.0019, + "num_tokens": 34800676.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.7695997048515034, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.02956281427759677, + "learning_rate": 1.545642286220684e-05, + "loss": 0.0012, + "num_tokens": 34805008.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 277.5, + "completions/mean_terminated_length": 277.5, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.7697841726618705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.06869720760732889, + "learning_rate": 1.5453724299400786e-05, + "loss": 0.0027, + "num_tokens": 34812972.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 259.0, + "completions/mean_terminated_length": 259.0, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.7699686404722376, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.03602675092406571, + "learning_rate": 1.5451025171188865e-05, + "loss": 0.0014, + "num_tokens": 34823508.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 357.375, + "completions/mean_terminated_length": 357.375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.7701531082826046, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1806640625, + "kl": 0.06415445217862725, + "learning_rate": 1.5448325477850907e-05, + "loss": 0.0026, + "num_tokens": 34833015.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 199.75, + "completions/mean_terminated_length": 199.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.7703375760929718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.05697881802916527, + "learning_rate": 1.5445625219666792e-05, + "loss": 0.0023, + "num_tokens": 34842373.0, + "reward": 1.5790441036224365, + "reward_std": 0.3211272060871124, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5790441036224365, + "rewards/fixed_code_pass_all_test_reward/std": 0.3211272060871124, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 324.875, + "completions/mean_terminated_length": 324.875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7705220439033389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.047597558004781604, + "learning_rate": 1.544292439691647e-05, + "loss": 0.0019, + "num_tokens": 34851748.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.770706511713706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.029382308362983167, + "learning_rate": 1.5440223009879946e-05, + "loss": 0.0012, + "num_tokens": 34855755.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 313.375, + "completions/mean_terminated_length": 313.375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.770890979524073, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.039363088784739375, + "learning_rate": 1.5437521058837273e-05, + "loss": 0.0016, + "num_tokens": 34864270.0, + "reward": 1.3125, + "reward_std": 0.7039429545402527, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.4955156147480011, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 442.875, + "completions/mean_terminated_length": 442.875, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.7710754473344401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76953125, + "kl": 0.0333313561277464, + "learning_rate": 1.543481854406858e-05, + "loss": 0.0013, + "num_tokens": 34873229.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 210.0, + "completions/mean_terminated_length": 210.0, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.7712599151448072, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.05146278068423271, + "learning_rate": 1.5432115465854042e-05, + "loss": 0.0021, + "num_tokens": 34880069.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 274.0, + "completions/mean_terminated_length": 274.0, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.7714443829551744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.03505857149139047, + "learning_rate": 1.5429411824473897e-05, + "loss": 0.0014, + "num_tokens": 34886469.0, + "reward": 1.9642857313156128, + "reward_std": 0.06612997502088547, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.06613000482320786, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 310.125, + "completions/mean_terminated_length": 310.125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7716288507655414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.03198063187301159, + "learning_rate": 1.5426707620208435e-05, + "loss": 0.0013, + "num_tokens": 34897758.0, + "reward": 1.530172348022461, + "reward_std": 0.7348925471305847, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6551724076271057, + "rewards/fixed_code_pass_all_test_reward/std": 0.4773625433444977, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 226.25, + "completions/mean_terminated_length": 226.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.7718133185759085, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.037183623760938644, + "learning_rate": 1.5424002853338022e-05, + "loss": 0.0015, + "num_tokens": 34903552.0, + "reward": 1.5208333730697632, + "reward_std": 0.4027435779571533, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5208333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.4027435779571533, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 456.375, + "completions/mean_terminated_length": 456.375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.7719977863862756, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.84375, + "kl": 0.03460608725436032, + "learning_rate": 1.5421297524143062e-05, + "loss": 0.0014, + "num_tokens": 34916003.0, + "reward": 1.48046875, + "reward_std": 0.28702935576438904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.48046875, + "rewards/fixed_code_pass_all_test_reward/std": 0.28702935576438904, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.7721822541966427, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6328125, + "kl": 0.06287805223837495, + "learning_rate": 1.5418591632904025e-05, + "loss": 0.0025, + "num_tokens": 34925267.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 173.0, + "completions/mean_terminated_length": 173.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.7723667220070097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.291015625, + "kl": 0.06577357917558402, + "learning_rate": 1.541588517990144e-05, + "loss": 0.0026, + "num_tokens": 34929491.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7725511898173769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1357421875, + "kl": 0.06512370379641652, + "learning_rate": 1.5413178165415902e-05, + "loss": 0.0026, + "num_tokens": 34938546.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 313.375, + "completions/mean_terminated_length": 313.375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.772735657627744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.03373398771509528, + "learning_rate": 1.5410470589728043e-05, + "loss": 0.0013, + "num_tokens": 34949333.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.7729201254381111, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5703125, + "kl": 0.05231419624760747, + "learning_rate": 1.540776245311858e-05, + "loss": 0.0021, + "num_tokens": 34956625.0, + "reward": 1.9090908765792847, + "reward_std": 0.16833095252513885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9090908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.16833093762397766, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 234.0, + "completions/mean_terminated_length": 234.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7731045932484781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.049576778430491686, + "learning_rate": 1.5405053755868262e-05, + "loss": 0.002, + "num_tokens": 34962601.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 343.75, + "completions/mean_terminated_length": 343.75, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.7732890610588452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09326171875, + "kl": 0.07637656712904572, + "learning_rate": 1.540234449825792e-05, + "loss": 0.0031, + "num_tokens": 34970607.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 508.25, + "completions/mean_terminated_length": 508.25, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.7734735288692123, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.03211791848298162, + "learning_rate": 1.5399634680568426e-05, + "loss": 0.0013, + "num_tokens": 34983273.0, + "reward": 1.375, + "reward_std": 0.6025738716125488, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.30860671401023865, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 347.375, + "completions/mean_terminated_length": 347.375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7736579966795795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.08807623595930636, + "learning_rate": 1.5396924303080715e-05, + "loss": 0.0035, + "num_tokens": 34990540.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 205.5, + "completions/mean_terminated_length": 205.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.7738424644899465, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.038086773827672005, + "learning_rate": 1.539421336607578e-05, + "loss": 0.0015, + "num_tokens": 34996040.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 120.75, + "completions/mean_terminated_length": 120.75, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.7740269323003136, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.890625, + "kl": 0.1252927128225565, + "learning_rate": 1.5391501869834677e-05, + "loss": 0.005, + "num_tokens": 34999830.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 336.25, + "completions/mean_terminated_length": 336.25, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.7742114001106807, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.04273383517283946, + "learning_rate": 1.5388789814638515e-05, + "loss": 0.0017, + "num_tokens": 35009504.0, + "reward": 1.8648648262023926, + "reward_std": 0.1444655805826187, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8648648262023926, + "rewards/fixed_code_pass_all_test_reward/std": 0.14446555078029633, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 488.0, + "completions/mean_terminated_length": 488.0, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.7743958679210478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.025885277660563588, + "learning_rate": 1.538607720076846e-05, + "loss": 0.001, + "num_tokens": 35021480.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.7745803357314148, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.03954869415611029, + "learning_rate": 1.538336402850574e-05, + "loss": 0.0016, + "num_tokens": 35027336.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 184.75, + "completions/mean_terminated_length": 184.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.774764803541782, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.05817483353894204, + "learning_rate": 1.538065029813164e-05, + "loss": 0.0023, + "num_tokens": 35031710.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 240.625, + "completions/mean_terminated_length": 240.625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.7749492713521491, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.048816447611898184, + "learning_rate": 1.5377936009927493e-05, + "loss": 0.002, + "num_tokens": 35041147.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 413.125, + "completions/mean_terminated_length": 413.125, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.7751337391625162, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.03886149381287396, + "learning_rate": 1.537522116417471e-05, + "loss": 0.0016, + "num_tokens": 35049812.0, + "reward": 1.014285683631897, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.014285714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.7753182069728832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.038354938733391464, + "learning_rate": 1.537250576115474e-05, + "loss": 0.0015, + "num_tokens": 35056448.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 94.75, + "completions/mean_terminated_length": 94.75, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.7755026747832503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.212890625, + "kl": 0.08601234992966056, + "learning_rate": 1.53697898011491e-05, + "loss": 0.0034, + "num_tokens": 35059926.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 284.75, + "completions/mean_terminated_length": 284.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7756871425936174, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.060583160258829594, + "learning_rate": 1.5367073284439366e-05, + "loss": 0.0024, + "num_tokens": 35066660.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 615.5, + "completions/mean_terminated_length": 615.5, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.7758716104039846, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.02036650711670518, + "learning_rate": 1.5364356211307162e-05, + "loss": 0.0008, + "num_tokens": 35081448.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 457.875, + "completions/mean_terminated_length": 457.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.7760560782143516, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.05927315982989967, + "learning_rate": 1.536163858203418e-05, + "loss": 0.0024, + "num_tokens": 35096855.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 375.75, + "completions/mean_terminated_length": 375.75, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7762405460247187, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.044437556294724345, + "learning_rate": 1.5358920396902167e-05, + "loss": 0.0018, + "num_tokens": 35107229.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7764250138350858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05859375, + "kl": 0.04613933269865811, + "learning_rate": 1.5356201656192924e-05, + "loss": 0.0018, + "num_tokens": 35115585.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 222.25, + "completions/mean_terminated_length": 222.25, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.7766094816454528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7421875, + "kl": 0.10815837560221553, + "learning_rate": 1.535348236018831e-05, + "loss": 0.0043, + "num_tokens": 35122811.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 249.625, + "completions/mean_terminated_length": 249.625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.7767939494558199, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.484375, + "kl": 0.0454112661536783, + "learning_rate": 1.5350762509170253e-05, + "loss": 0.0018, + "num_tokens": 35128936.0, + "reward": 1.8958332538604736, + "reward_std": 0.294627845287323, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8958333134651184, + "rewards/fixed_code_pass_all_test_reward/std": 0.294627845287323, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 254.875, + "completions/mean_terminated_length": 254.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7769784172661871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04150390625, + "kl": 0.031617205357179046, + "learning_rate": 1.5348042103420718e-05, + "loss": 0.0013, + "num_tokens": 35133927.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 215.875, + "completions/mean_terminated_length": 215.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.7771628850765542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.03597757639363408, + "learning_rate": 1.534532114322174e-05, + "loss": 0.0014, + "num_tokens": 35138870.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 331.375, + "completions/mean_terminated_length": 331.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7773473528869213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060302734375, + "kl": 0.04311914648860693, + "learning_rate": 1.5342599628855415e-05, + "loss": 0.0017, + "num_tokens": 35145737.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 423.125, + "completions/mean_terminated_length": 423.125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.7775318206972883, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.08446195302531123, + "learning_rate": 1.5339877560603893e-05, + "loss": 0.0034, + "num_tokens": 35155098.0, + "reward": 1.125, + "reward_std": 0.6408699750900269, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 400.5, + "completions/mean_terminated_length": 400.5, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.7777162885076554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.02498171158367768, + "learning_rate": 1.5337154938749374e-05, + "loss": 0.001, + "num_tokens": 35163438.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.7779007563180225, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.609375, + "kl": 0.05193942366167903, + "learning_rate": 1.5334431763574122e-05, + "loss": 0.0021, + "num_tokens": 35167923.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 4217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 224.25, + "completions/mean_terminated_length": 224.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.7780852241283895, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.142578125, + "kl": 0.03880469175055623, + "learning_rate": 1.533170803536046e-05, + "loss": 0.0016, + "num_tokens": 35172893.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 254.125, + "completions/mean_terminated_length": 254.125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.7782696919387567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.345703125, + "kl": 0.07256455300375819, + "learning_rate": 1.5328983754390764e-05, + "loss": 0.0029, + "num_tokens": 35180254.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 389.25, + "completions/mean_terminated_length": 389.25, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.7784541597491238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.04087743326090276, + "learning_rate": 1.5326258920947476e-05, + "loss": 0.0016, + "num_tokens": 35188152.0, + "reward": 1.8875000476837158, + "reward_std": 0.1552647203207016, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8875000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.15526476502418518, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 359.25, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7786386275594909, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.043054357171058655, + "learning_rate": 1.5323533535313076e-05, + "loss": 0.0017, + "num_tokens": 35201186.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 385.625, + "completions/mean_terminated_length": 385.625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.7788230953698579, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.050755386939272285, + "learning_rate": 1.5320807597770124e-05, + "loss": 0.002, + "num_tokens": 35211007.0, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 676.375, + "completions/mean_terminated_length": 676.375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.779007563180225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.0350166589487344, + "learning_rate": 1.5318081108601228e-05, + "loss": 0.0014, + "num_tokens": 35225594.0, + "reward": 1.46875, + "reward_std": 0.45193037390708923, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46875, + "rewards/fixed_code_pass_all_test_reward/std": 0.4519304037094116, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 185.875, + "completions/mean_terminated_length": 185.875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.7791920309905921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.65625, + "kl": 0.08683659369125962, + "learning_rate": 1.5315354068089045e-05, + "loss": 0.0035, + "num_tokens": 35232353.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 292.625, + "completions/mean_terminated_length": 292.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7793764988009593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.04753512376919389, + "learning_rate": 1.53126264765163e-05, + "loss": 0.0019, + "num_tokens": 35242006.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 483.75, + "completions/mean_terminated_length": 483.75, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.7795609666113263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.046729941852390766, + "learning_rate": 1.5309898334165772e-05, + "loss": 0.0019, + "num_tokens": 35251084.0, + "reward": 0.9147726893424988, + "reward_std": 0.36962398886680603, + "rewards/fixed_code_pass_all_test_reward/mean": 0.039772726595401764, + "rewards/fixed_code_pass_all_test_reward/std": 0.016070609912276268, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 319.0, + "completions/mean_terminated_length": 319.0, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.7797454344216934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.953125, + "kl": 0.05022623646073043, + "learning_rate": 1.5307169641320296e-05, + "loss": 0.002, + "num_tokens": 35258148.0, + "reward": 1.8928571939468384, + "reward_std": 0.14787116646766663, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571939468384, + "rewards/fixed_code_pass_all_test_reward/std": 0.14787118136882782, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 129.5, + "completions/mean_terminated_length": 129.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.7799299022320605, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.53125, + "kl": 0.06938020884990692, + "learning_rate": 1.5304440398262764e-05, + "loss": 0.0028, + "num_tokens": 35262056.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 171.625, + "completions/mean_terminated_length": 171.625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7801143700424276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048583984375, + "kl": 0.029115087701939046, + "learning_rate": 1.5301710605276127e-05, + "loss": 0.0012, + "num_tokens": 35266749.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7802988378527946, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.035598845686763525, + "learning_rate": 1.529898026264339e-05, + "loss": 0.0014, + "num_tokens": 35274119.0, + "reward": 1.959302306175232, + "reward_std": 0.03881504014134407, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9593023061752319, + "rewards/fixed_code_pass_all_test_reward/std": 0.03881501778960228, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 488.625, + "completions/mean_terminated_length": 488.625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.7804833056631618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.0507223941385746, + "learning_rate": 1.5296249370647623e-05, + "loss": 0.002, + "num_tokens": 35282956.0, + "reward": 1.9666666984558105, + "reward_std": 0.09428088366985321, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9666666984558105, + "rewards/fixed_code_pass_all_test_reward/std": 0.0942808985710144, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 931.0, + "completions/max_terminated_length": 931.0, + "completions/mean_length": 494.125, + "completions/mean_terminated_length": 494.125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7806677734735289, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.024287003790959716, + "learning_rate": 1.529351792957194e-05, + "loss": 0.001, + "num_tokens": 35295517.0, + "reward": 1.9719101190567017, + "reward_std": 0.07945017516613007, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9719101190567017, + "rewards/fixed_code_pass_all_test_reward/std": 0.07945020496845245, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 391.375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.780852241283896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.06769803026691079, + "learning_rate": 1.529078593969952e-05, + "loss": 0.0027, + "num_tokens": 35306184.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 173.625, + "completions/mean_terminated_length": 173.625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.781036709094263, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.05453792284242809, + "learning_rate": 1.5288053401313597e-05, + "loss": 0.0022, + "num_tokens": 35310453.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 344.0, + "completions/mean_terminated_length": 344.0, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7812211769046301, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.08278459077700973, + "learning_rate": 1.5285320314697465e-05, + "loss": 0.0033, + "num_tokens": 35320485.0, + "reward": 1.375, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.7814056447149972, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.039522164734080434, + "learning_rate": 1.528258668013447e-05, + "loss": 0.0016, + "num_tokens": 35334358.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 341.875, + "completions/mean_terminated_length": 341.875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.7815901125253644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.06758507061749697, + "learning_rate": 1.5279852497908018e-05, + "loss": 0.0027, + "num_tokens": 35347005.0, + "reward": 1.4166667461395264, + "reward_std": 0.21257823705673218, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.21257825195789337, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 468.625, + "completions/mean_terminated_length": 468.625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.7817745803357314, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.03683425486087799, + "learning_rate": 1.5277117768301566e-05, + "loss": 0.0015, + "num_tokens": 35359466.0, + "reward": 1.84375, + "reward_std": 0.2893187701702118, + "rewards/fixed_code_pass_all_test_reward/mean": 0.84375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2893187701702118, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 128.5, + "completions/mean_terminated_length": 128.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.7819590481460985, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.16712904628366232, + "learning_rate": 1.5274382491598646e-05, + "loss": 0.0067, + "num_tokens": 35363374.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 171.75, + "completions/mean_terminated_length": 171.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.7821435159564656, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.40625, + "kl": 0.07661651633679867, + "learning_rate": 1.527164666808282e-05, + "loss": 0.0031, + "num_tokens": 35372700.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 235.125, + "completions/mean_terminated_length": 235.125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.7823279837668327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.126953125, + "kl": 0.041999108041636646, + "learning_rate": 1.5268910298037724e-05, + "loss": 0.0017, + "num_tokens": 35377437.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 279.5, + "completions/mean_terminated_length": 279.5, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.7825124515771997, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.04805940203368664, + "learning_rate": 1.526617338174705e-05, + "loss": 0.0019, + "num_tokens": 35386625.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 343.375, + "completions/mean_terminated_length": 343.375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.7826969193875669, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.0276016442803666, + "learning_rate": 1.5263435919494538e-05, + "loss": 0.0011, + "num_tokens": 35393444.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 267.0, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.782881387197934, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.056785646826028824, + "learning_rate": 1.5260697911563993e-05, + "loss": 0.0023, + "num_tokens": 35401684.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 326.0, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.7830658550083011, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.058073720196262, + "learning_rate": 1.525795935823927e-05, + "loss": 0.0023, + "num_tokens": 35407796.0, + "reward": 1.90625, + "reward_std": 0.1293872892856598, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.12938730418682098, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 475.875, + "completions/mean_terminated_length": 475.875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.7832503228186681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.04560146410949528, + "learning_rate": 1.525522025980429e-05, + "loss": 0.0018, + "num_tokens": 35420091.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 362.375, + "completions/mean_terminated_length": 362.375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7834347906290352, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.08364721166435629, + "learning_rate": 1.5252480616543021e-05, + "loss": 0.0033, + "num_tokens": 35431286.0, + "reward": 1.0125000476837158, + "reward_std": 0.035355329513549805, + "rewards/fixed_code_pass_all_test_reward/mean": 0.012500000186264515, + "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 447.125, + "completions/mean_terminated_length": 447.125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.7836192584394023, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.041779750026762486, + "learning_rate": 1.5249740428739487e-05, + "loss": 0.0017, + "num_tokens": 35443807.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 189.75, + "completions/mean_terminated_length": 189.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7838037262497695, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.109375, + "kl": 0.04858891956973821, + "learning_rate": 1.5246999696677783e-05, + "loss": 0.0019, + "num_tokens": 35448213.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 200.0, + "completions/mean_terminated_length": 200.0, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7839881940601365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.65625, + "kl": 0.04711187840439379, + "learning_rate": 1.5244258420642041e-05, + "loss": 0.0019, + "num_tokens": 35453621.0, + "reward": 1.7999999523162842, + "reward_std": 0.12344267219305038, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.12344267219305038, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 488.125, + "completions/mean_terminated_length": 488.125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.7841726618705036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94140625, + "kl": 0.040148946922272444, + "learning_rate": 1.5241516600916462e-05, + "loss": 0.0016, + "num_tokens": 35466790.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7843571296808707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.89453125, + "kl": 0.06099623767659068, + "learning_rate": 1.5238774237785297e-05, + "loss": 0.0024, + "num_tokens": 35472362.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 258.875, + "completions/mean_terminated_length": 258.875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.7845415974912378, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.484375, + "kl": 0.17344183754175901, + "learning_rate": 1.5236031331532857e-05, + "loss": 0.0069, + "num_tokens": 35480673.0, + "reward": 1.6749999523162842, + "reward_std": 0.20059439539909363, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6749999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.20059436559677124, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 441.625, + "completions/mean_terminated_length": 441.625, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.7847260653016048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.043307977030053735, + "learning_rate": 1.523328788244351e-05, + "loss": 0.0017, + "num_tokens": 35489318.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 215.0, + "completions/mean_terminated_length": 215.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.784910533111972, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.953125, + "kl": 0.026094762375578284, + "learning_rate": 1.5230543890801676e-05, + "loss": 0.001, + "num_tokens": 35493894.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 308.5, + "completions/mean_terminated_length": 308.5, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.7850950009223391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.054978877771645784, + "learning_rate": 1.5227799356891838e-05, + "loss": 0.0022, + "num_tokens": 35501882.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 397.875, + "completions/mean_terminated_length": 397.875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.7852794687327062, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.04716405947692692, + "learning_rate": 1.5225054280998523e-05, + "loss": 0.0019, + "num_tokens": 35512113.0, + "reward": 1.9090909957885742, + "reward_std": 0.08416546136140823, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9090909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.08416546136140823, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 285.25, + "completions/mean_terminated_length": 285.25, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7854639365430732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.06488224258646369, + "learning_rate": 1.5222308663406333e-05, + "loss": 0.0026, + "num_tokens": 35520475.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.7856484043534403, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.05268768919631839, + "learning_rate": 1.5219562504399907e-05, + "loss": 0.0021, + "num_tokens": 35529621.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 194.875, + "completions/mean_terminated_length": 194.875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.7858328721638074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.05193520220927894, + "learning_rate": 1.521681580426395e-05, + "loss": 0.0021, + "num_tokens": 35536204.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 588.125, + "completions/mean_terminated_length": 588.125, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "epoch": 0.7860173399741746, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.171875, + "kl": 0.024990473873913288, + "learning_rate": 1.5214068563283223e-05, + "loss": 0.001, + "num_tokens": 35547085.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 277.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7862018077845416, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.15625, + "kl": 0.2598787338938564, + "learning_rate": 1.5211320781742544e-05, + "loss": 0.0104, + "num_tokens": 35554943.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 243.375, + "completions/mean_terminated_length": 243.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.7863862755949087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.09348477283492684, + "learning_rate": 1.5208572459926783e-05, + "loss": 0.0037, + "num_tokens": 35560378.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 273.5, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7865707434052758, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.040949560003355145, + "learning_rate": 1.5205823598120863e-05, + "loss": 0.0016, + "num_tokens": 35566846.0, + "reward": 1.7434210777282715, + "reward_std": 0.32740238308906555, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7434210777282715, + "rewards/fixed_code_pass_all_test_reward/std": 0.32740238308906555, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 394.75, + "completions/mean_terminated_length": 394.75, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7867552112156428, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.98828125, + "kl": 0.03829534165561199, + "learning_rate": 1.5203074196609775e-05, + "loss": 0.0015, + "num_tokens": 35577364.0, + "reward": 1.9894737005233765, + "reward_std": 0.01949092000722885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9894737005233765, + "rewards/fixed_code_pass_all_test_reward/std": 0.01949094794690609, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 407.25, + "completions/mean_terminated_length": 407.25, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.7869396790260099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.04996078601107001, + "learning_rate": 1.5200324255678553e-05, + "loss": 0.002, + "num_tokens": 35585598.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 696.5, + "completions/mean_terminated_length": 696.5, + "completions/min_length": 583.0, + "completions/min_terminated_length": 583.0, + "epoch": 0.7871241468363771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.02940192143432796, + "learning_rate": 1.5197573775612297e-05, + "loss": 0.0012, + "num_tokens": 35603002.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 168.125, + "completions/mean_terminated_length": 168.125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.7873086146467442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08740234375, + "kl": 0.05472077685408294, + "learning_rate": 1.5194822756696154e-05, + "loss": 0.0022, + "num_tokens": 35607331.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 452.0, + "completions/mean_terminated_length": 452.0, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.7874930824571112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.04624458588659763, + "learning_rate": 1.5192071199215334e-05, + "loss": 0.0018, + "num_tokens": 35619691.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 303.875, + "completions/mean_terminated_length": 303.875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.7876775502674783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6796875, + "kl": 0.03861832758411765, + "learning_rate": 1.5189319103455104e-05, + "loss": 0.0015, + "num_tokens": 35628794.0, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 160.75, + "completions/mean_terminated_length": 160.75, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.7878620180778454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1201171875, + "kl": 0.044964129803702235, + "learning_rate": 1.5186566469700776e-05, + "loss": 0.0018, + "num_tokens": 35633016.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 278.75, + "completions/mean_terminated_length": 278.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.7880464858882125, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.07516616676002741, + "learning_rate": 1.5183813298237727e-05, + "loss": 0.003, + "num_tokens": 35642894.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 264.125, + "completions/mean_terminated_length": 264.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7882309536985797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.031358262058347464, + "learning_rate": 1.5181059589351388e-05, + "loss": 0.0013, + "num_tokens": 35651743.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 99.0, + "completions/mean_terminated_length": 99.0, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.7884154215089467, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.06317651877179742, + "learning_rate": 1.5178305343327247e-05, + "loss": 0.0025, + "num_tokens": 35655247.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 317.875, + "completions/mean_terminated_length": 317.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7885998893193138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1416015625, + "kl": 0.028904934064485133, + "learning_rate": 1.5175550560450843e-05, + "loss": 0.0012, + "num_tokens": 35662142.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 311.375, + "completions/mean_terminated_length": 311.375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.7887843571296809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038818359375, + "kl": 0.032977010821923614, + "learning_rate": 1.5172795241007775e-05, + "loss": 0.0013, + "num_tokens": 35669281.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 361.125, + "completions/mean_terminated_length": 361.125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.7889688249400479, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1796875, + "kl": 0.04830338363535702, + "learning_rate": 1.5170039385283697e-05, + "loss": 0.0019, + "num_tokens": 35681330.0, + "reward": 1.6840277910232544, + "reward_std": 0.3826562464237213, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6840277910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.3826562762260437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 219.375, + "completions/mean_terminated_length": 219.375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.789153292750415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0576171875, + "kl": 0.025045650778338313, + "learning_rate": 1.5167282993564316e-05, + "loss": 0.001, + "num_tokens": 35686925.0, + "reward": 1.6710526943206787, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6710526347160339, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 225.625, + "completions/mean_terminated_length": 225.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.7893377605607822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.062272934243083, + "learning_rate": 1.5164526066135397e-05, + "loss": 0.0025, + "num_tokens": 35693818.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 524.125, + "completions/mean_terminated_length": 524.125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.7895222283711493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.03580970806069672, + "learning_rate": 1.516176860328276e-05, + "loss": 0.0014, + "num_tokens": 35703635.0, + "reward": 1.5625, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 107.625, + "completions/mean_terminated_length": 107.625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.7897066961815163, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0, + "kl": 0.08616695599630475, + "learning_rate": 1.5159010605292284e-05, + "loss": 0.0034, + "num_tokens": 35707304.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 364.375, + "completions/mean_terminated_length": 364.375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7898911639918834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.06656726030632854, + "learning_rate": 1.5156252072449894e-05, + "loss": 0.0027, + "num_tokens": 35717195.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 304.125, + "completions/mean_terminated_length": 304.125, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.7900756318022505, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.0351130492053926, + "learning_rate": 1.5153493005041578e-05, + "loss": 0.0014, + "num_tokens": 35722932.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7902600996126176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.04435131628997624, + "learning_rate": 1.5150733403353377e-05, + "loss": 0.0018, + "num_tokens": 35729509.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 440.5, + "completions/mean_terminated_length": 440.5, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.7904445674229846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.040356529178097844, + "learning_rate": 1.5147973267671394e-05, + "loss": 0.0016, + "num_tokens": 35739473.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 357.625, + "completions/mean_terminated_length": 357.625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7906290352333518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.047102137468755245, + "learning_rate": 1.5145212598281776e-05, + "loss": 0.0019, + "num_tokens": 35750862.0, + "reward": 1.734375, + "reward_std": 0.16952534019947052, + "rewards/fixed_code_pass_all_test_reward/mean": 0.734375, + "rewards/fixed_code_pass_all_test_reward/std": 0.16952534019947052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 317.625, + "completions/mean_terminated_length": 317.625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.7908135030437189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0712890625, + "kl": 0.03653145511634648, + "learning_rate": 1.514245139547073e-05, + "loss": 0.0015, + "num_tokens": 35756747.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 341.25, + "completions/mean_terminated_length": 341.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.790997970854086, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.015574604272842407, + "learning_rate": 1.5139689659524522e-05, + "loss": 0.0006, + "num_tokens": 35763821.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 513.875, + "completions/mean_terminated_length": 513.875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.791182438664453, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.03578599519096315, + "learning_rate": 1.513692739072947e-05, + "loss": 0.0014, + "num_tokens": 35773532.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 351.125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.7913669064748201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1591796875, + "kl": 0.06496074004098773, + "learning_rate": 1.5134164589371947e-05, + "loss": 0.0026, + "num_tokens": 35783285.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 531.5, + "completions/mean_terminated_length": 531.5, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.7915513742851872, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.04257419263012707, + "learning_rate": 1.5131401255738381e-05, + "loss": 0.0017, + "num_tokens": 35797921.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 300.875, + "completions/mean_terminated_length": 300.875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7917358420955544, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.0576136929448694, + "learning_rate": 1.5128637390115256e-05, + "loss": 0.0023, + "num_tokens": 35804552.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 169.375, + "completions/mean_terminated_length": 169.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.7919203099059214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.037702382542192936, + "learning_rate": 1.5125872992789119e-05, + "loss": 0.0015, + "num_tokens": 35808603.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 274.625, + "completions/mean_terminated_length": 274.625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.7921047777162885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0306396484375, + "kl": 0.017003939487040043, + "learning_rate": 1.5123108064046554e-05, + "loss": 0.0007, + "num_tokens": 35814520.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 611.75, + "completions/mean_terminated_length": 611.75, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.7922892455266556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.036021529231220484, + "learning_rate": 1.5120342604174213e-05, + "loss": 0.0014, + "num_tokens": 35824774.0, + "reward": 1.326923131942749, + "reward_std": 0.5762816667556763, + "rewards/fixed_code_pass_all_test_reward/mean": 0.45192307233810425, + "rewards/fixed_code_pass_all_test_reward/std": 0.27924850583076477, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 243.75, + "completions/mean_terminated_length": 243.75, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7924737133370227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.94921875, + "kl": 0.037810812471434474, + "learning_rate": 1.5117576613458803e-05, + "loss": 0.0015, + "num_tokens": 35833844.0, + "reward": 1.5, + "reward_std": 0.9258201122283936, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 634.25, + "completions/mean_terminated_length": 634.25, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "epoch": 0.7926581811473897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.82421875, + "kl": 0.023020552471280098, + "learning_rate": 1.5114810092187082e-05, + "loss": 0.0009, + "num_tokens": 35852398.0, + "reward": 1.8020833730697632, + "reward_std": 0.3240906596183777, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8020833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.3240906298160553, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7928426489577569, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.109375, + "kl": 0.02639222051948309, + "learning_rate": 1.5112043040645863e-05, + "loss": 0.0011, + "num_tokens": 35858077.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.793027116768124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.07339434139430523, + "learning_rate": 1.510927545912202e-05, + "loss": 0.0029, + "num_tokens": 35861942.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 187.0, + "completions/mean_terminated_length": 187.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.7932115845784911, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.28125, + "kl": 0.2752287737093866, + "learning_rate": 1.5106507347902475e-05, + "loss": 0.011, + "num_tokens": 35866206.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 253.75, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.7933960523888581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035400390625, + "kl": 0.016725976718589664, + "learning_rate": 1.5103738707274205e-05, + "loss": 0.0007, + "num_tokens": 35871500.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 259.5, + "completions/mean_terminated_length": 259.5, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7935805201992252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.018712026649154723, + "learning_rate": 1.5100969537524245e-05, + "loss": 0.0007, + "num_tokens": 35876728.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7937649880095923, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.057299571577459574, + "learning_rate": 1.5098199838939684e-05, + "loss": 0.0023, + "num_tokens": 35885653.0, + "reward": 1.5749999284744263, + "reward_std": 0.6363961100578308, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7000000476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 627.5, + "completions/mean_terminated_length": 627.5, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.7939494558199595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.027587792137637734, + "learning_rate": 1.5095429611807673e-05, + "loss": 0.0011, + "num_tokens": 35899769.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 323.5, + "completions/mean_terminated_length": 323.5, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7941339236303265, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.02641181240323931, + "learning_rate": 1.5092658856415403e-05, + "loss": 0.0011, + "num_tokens": 35910325.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 265.25, + "completions/mean_terminated_length": 265.25, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.7943183914406936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.049518843181431293, + "learning_rate": 1.5089887573050129e-05, + "loss": 0.002, + "num_tokens": 35919127.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 596.875, + "completions/mean_terminated_length": 596.875, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "epoch": 0.7945028592510607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.03633172903209925, + "learning_rate": 1.5087115761999158e-05, + "loss": 0.0015, + "num_tokens": 35930350.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.7946873270614278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09814453125, + "kl": 0.047069798689335585, + "learning_rate": 1.5084343423549857e-05, + "loss": 0.0019, + "num_tokens": 35938413.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.7948717948717948, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07177734375, + "kl": 0.03980321902781725, + "learning_rate": 1.508157055798964e-05, + "loss": 0.0016, + "num_tokens": 35948479.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 253.125, + "completions/mean_terminated_length": 253.125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.795056262682162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.07086195051670074, + "learning_rate": 1.507879716560598e-05, + "loss": 0.0028, + "num_tokens": 35956128.0, + "reward": 1.75, + "reward_std": 0.26726123690605164, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.26726123690605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 287.25, + "completions/mean_terminated_length": 287.25, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7952407304925291, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.040794012136757374, + "learning_rate": 1.5076023246686407e-05, + "loss": 0.0016, + "num_tokens": 35966354.0, + "reward": 1.7922296524047852, + "reward_std": 0.3080655038356781, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7922297716140747, + "rewards/fixed_code_pass_all_test_reward/std": 0.3080655634403229, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 188.375, + "completions/mean_terminated_length": 188.375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7954251983028962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.052502304781228304, + "learning_rate": 1.5073248801518499e-05, + "loss": 0.0021, + "num_tokens": 35974141.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 321.25, + "completions/mean_terminated_length": 321.25, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.7956096661132632, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.04905204311944544, + "learning_rate": 1.5070473830389892e-05, + "loss": 0.002, + "num_tokens": 35983167.0, + "reward": 1.625, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 418.625, + "completions/mean_terminated_length": 418.625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7957941339236303, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.177734375, + "kl": 0.07289852318353951, + "learning_rate": 1.5067698333588276e-05, + "loss": 0.0029, + "num_tokens": 35991004.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 288.375, + "completions/mean_terminated_length": 288.375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7959786017339974, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.05602266942150891, + "learning_rate": 1.5064922311401403e-05, + "loss": 0.0022, + "num_tokens": 36001719.0, + "reward": 1.7777776718139648, + "reward_std": 0.20573778450489044, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7777777910232544, + "rewards/fixed_code_pass_all_test_reward/std": 0.20573779940605164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 262.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.7961630695443646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.02790191792882979, + "learning_rate": 1.5062145764117064e-05, + "loss": 0.0011, + "num_tokens": 36010948.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 288.125, + "completions/mean_terminated_length": 288.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7963475373547316, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.030230681411921978, + "learning_rate": 1.5059368692023114e-05, + "loss": 0.0012, + "num_tokens": 36017717.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 318.75, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.7965320051650987, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.033905977848917246, + "learning_rate": 1.5056591095407465e-05, + "loss": 0.0014, + "num_tokens": 36024651.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 289.75, + "completions/mean_terminated_length": 289.75, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.7967164729754658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1494140625, + "kl": 0.06293100956827402, + "learning_rate": 1.5053812974558078e-05, + "loss": 0.0025, + "num_tokens": 36034345.0, + "reward": 1.0833333730697632, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 323.75, + "completions/mean_terminated_length": 323.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.7969009407858328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.51171875, + "kl": 0.08539120806381106, + "learning_rate": 1.5051034329762972e-05, + "loss": 0.0034, + "num_tokens": 36043839.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 422.0, + "completions/mean_terminated_length": 422.0, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.7970854085961999, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03186343004927039, + "learning_rate": 1.5048255161310215e-05, + "loss": 0.0013, + "num_tokens": 36056591.0, + "reward": 1.34375, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 183.5, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.7972698764065671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.037303369492292404, + "learning_rate": 1.5045475469487932e-05, + "loss": 0.0015, + "num_tokens": 36060827.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 213.125, + "completions/mean_terminated_length": 213.125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.7974543442169342, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.03575144917704165, + "learning_rate": 1.5042695254584308e-05, + "loss": 0.0014, + "num_tokens": 36067300.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.7976388120273012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056884765625, + "kl": 0.043438287219032645, + "learning_rate": 1.5039914516887575e-05, + "loss": 0.0017, + "num_tokens": 36075195.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 439.25, + "completions/mean_terminated_length": 439.25, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.7978232798376683, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.060549080139026046, + "learning_rate": 1.5037133256686019e-05, + "loss": 0.0024, + "num_tokens": 36086629.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 682.25, + "completions/mean_terminated_length": 682.25, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.7980077476480354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.021951534552499652, + "learning_rate": 1.5034351474267985e-05, + "loss": 0.0009, + "num_tokens": 36102327.0, + "reward": 1.990384578704834, + "reward_std": 0.027196446433663368, + "rewards/fixed_code_pass_all_test_reward/mean": 0.990384578704834, + "rewards/fixed_code_pass_all_test_reward/std": 0.027196412906050682, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.7981922154584025, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.03357413015328348, + "learning_rate": 1.5031569169921869e-05, + "loss": 0.0013, + "num_tokens": 36109365.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 251.125, + "completions/mean_terminated_length": 251.125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.7983766832687696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.07525891670957208, + "learning_rate": 1.5028786343936123e-05, + "loss": 0.003, + "num_tokens": 36119326.0, + "reward": 1.5892857313156128, + "reward_std": 0.0991949662566185, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5892857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.09919500350952148, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 118.0, + "completions/max_terminated_length": 118.0, + "completions/mean_length": 95.25, + "completions/mean_terminated_length": 95.25, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.7985611510791367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11279296875, + "kl": 0.030775656923651695, + "learning_rate": 1.5026002996599252e-05, + "loss": 0.0012, + "num_tokens": 36122800.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 240.25, + "completions/mean_terminated_length": 240.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.7987456188895038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.057522712741047144, + "learning_rate": 1.5023219128199813e-05, + "loss": 0.0023, + "num_tokens": 36132474.0, + "reward": 1.7083333730697632, + "reward_std": 0.4520675241947174, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.4520675837993622, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 371.5, + "completions/mean_terminated_length": 371.5, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7989300866998709, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.05647197691723704, + "learning_rate": 1.502043473902642e-05, + "loss": 0.0023, + "num_tokens": 36139758.0, + "reward": 1.8643617630004883, + "reward_std": 0.11254923790693283, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8643617033958435, + "rewards/fixed_code_pass_all_test_reward/std": 0.11254924535751343, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 303.0, + "completions/mean_terminated_length": 303.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7991145545102379, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0517578125, + "kl": 0.048513495828956366, + "learning_rate": 1.5017649829367737e-05, + "loss": 0.0019, + "num_tokens": 36152230.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 470.25, + "completions/mean_terminated_length": 470.25, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.799299022320605, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.032952550100162625, + "learning_rate": 1.5014864399512487e-05, + "loss": 0.0013, + "num_tokens": 36160840.0, + "reward": 1.8214285373687744, + "reward_std": 0.3393528461456299, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9464285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.07393559068441391, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 283.0, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.7994834901309722, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.039512885734438896, + "learning_rate": 1.501207844974945e-05, + "loss": 0.0016, + "num_tokens": 36169112.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 435.125, + "completions/mean_terminated_length": 435.125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.7996679579413393, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.033033008221536875, + "learning_rate": 1.5009291980367449e-05, + "loss": 0.0013, + "num_tokens": 36180657.0, + "reward": 1.9514925479888916, + "reward_std": 0.0764177218079567, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9514925479888916, + "rewards/fixed_code_pass_all_test_reward/std": 0.07641774415969849, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 162.25, + "completions/mean_terminated_length": 162.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.7998524257517063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.03638588031753898, + "learning_rate": 1.5006504991655367e-05, + "loss": 0.0015, + "num_tokens": 36184875.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 450.625, + "completions/mean_terminated_length": 450.625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.8000368935620734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.03963064565323293, + "learning_rate": 1.500371748390214e-05, + "loss": 0.0016, + "num_tokens": 36196160.0, + "reward": 1.0475351810455322, + "reward_std": 0.5257523655891418, + "rewards/fixed_code_pass_all_test_reward/mean": 0.17253521084785461, + "rewards/fixed_code_pass_all_test_reward/std": 0.31956180930137634, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 371.625, + "completions/mean_terminated_length": 371.625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.8002213613724405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.03471875935792923, + "learning_rate": 1.500092945739676e-05, + "loss": 0.0014, + "num_tokens": 36202701.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 133.625, + "completions/mean_terminated_length": 133.625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.8004058291828076, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.1129117519594729, + "learning_rate": 1.4998140912428274e-05, + "loss": 0.0045, + "num_tokens": 36206450.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 177.875, + "completions/mean_terminated_length": 177.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.8005902969931747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.03067458188161254, + "learning_rate": 1.4995351849285773e-05, + "loss": 0.0012, + "num_tokens": 36210801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 414.875, + "completions/mean_terminated_length": 414.875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.8007747648035418, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.039575346279889345, + "learning_rate": 1.4992562268258413e-05, + "loss": 0.0016, + "num_tokens": 36222480.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 433.0, + "completions/mean_terminated_length": 433.0, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8009592326139089, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.044190880842506886, + "learning_rate": 1.4989772169635397e-05, + "loss": 0.0018, + "num_tokens": 36231944.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.801143700424276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044677734375, + "kl": 0.017380940495058894, + "learning_rate": 1.4986981553705985e-05, + "loss": 0.0007, + "num_tokens": 36236511.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 459.0, + "completions/mean_terminated_length": 459.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.801328168234643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66796875, + "kl": 0.014636554347816855, + "learning_rate": 1.4984190420759492e-05, + "loss": 0.0006, + "num_tokens": 36245311.0, + "reward": 1.2125000953674316, + "reward_std": 0.035355307161808014, + "rewards/fixed_code_pass_all_test_reward/mean": 0.21250000596046448, + "rewards/fixed_code_pass_all_test_reward/std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 250.25, + "completions/mean_terminated_length": 250.25, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.8015126360450101, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.041719620348885655, + "learning_rate": 1.4981398771085278e-05, + "loss": 0.0017, + "num_tokens": 36251129.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 355.125, + "completions/mean_terminated_length": 355.125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.8016971038553773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.02718952053692192, + "learning_rate": 1.4978606604972768e-05, + "loss": 0.0011, + "num_tokens": 36261226.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 310.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8018815716657444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.05092989862896502, + "learning_rate": 1.4975813922711435e-05, + "loss": 0.002, + "num_tokens": 36271172.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 354.0, + "completions/mean_terminated_length": 354.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.8020660394761114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.04369087144732475, + "learning_rate": 1.4973020724590803e-05, + "loss": 0.0017, + "num_tokens": 36278156.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 387.125, + "completions/mean_terminated_length": 387.125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.8022505072864785, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.055120388278737664, + "learning_rate": 1.4970227010900453e-05, + "loss": 0.0022, + "num_tokens": 36288501.0, + "reward": 1.4821428060531616, + "reward_std": 0.5460566282272339, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6071428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.3253166079521179, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 131.125, + "completions/mean_terminated_length": 131.125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.8024349750968456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.03757225477602333, + "learning_rate": 1.496743278193002e-05, + "loss": 0.0015, + "num_tokens": 36292262.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.8026194429072127, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.023363763699308038, + "learning_rate": 1.496463803796919e-05, + "loss": 0.0009, + "num_tokens": 36298492.0, + "reward": 1.94140625, + "reward_std": 0.1657281517982483, + "rewards/fixed_code_pass_all_test_reward/mean": 0.94140625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1657281517982483, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 372.375, + "completions/mean_terminated_length": 372.375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.8028039107175797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039306640625, + "kl": 0.02221129834651947, + "learning_rate": 1.4961842779307702e-05, + "loss": 0.0009, + "num_tokens": 36308911.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 296.625, + "completions/mean_terminated_length": 296.625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.8029883785279469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08935546875, + "kl": 0.04054278717376292, + "learning_rate": 1.4959047006235352e-05, + "loss": 0.0016, + "num_tokens": 36314516.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 458.25, + "completions/mean_terminated_length": 458.25, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.803172846338314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.030884688487276435, + "learning_rate": 1.4956250719041987e-05, + "loss": 0.0012, + "num_tokens": 36322238.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 389.25, + "completions/mean_terminated_length": 389.25, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.8033573141486811, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.039740476524457335, + "learning_rate": 1.4953453918017512e-05, + "loss": 0.0016, + "num_tokens": 36331136.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 299.875, + "completions/mean_terminated_length": 299.875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8035417819590481, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.034326078137382865, + "learning_rate": 1.4950656603451868e-05, + "loss": 0.0014, + "num_tokens": 36337959.0, + "reward": 1.3020832538604736, + "reward_std": 0.4541202187538147, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3020833432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.4541202485561371, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 296.25, + "completions/mean_terminated_length": 296.25, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8037262497694152, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.12260240619070828, + "learning_rate": 1.4947858775635073e-05, + "loss": 0.0049, + "num_tokens": 36348257.0, + "reward": 1.8192567825317383, + "reward_std": 0.29588446021080017, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8192567825317383, + "rewards/fixed_code_pass_all_test_reward/std": 0.29588449001312256, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 438.375, + "completions/mean_terminated_length": 438.375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8039107175797823, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.04997399100102484, + "learning_rate": 1.4945060434857185e-05, + "loss": 0.002, + "num_tokens": 36364260.0, + "reward": 1.78125, + "reward_std": 0.41052013635635376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.41052016615867615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.8040951853901495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.032463502953760326, + "learning_rate": 1.4942261581408315e-05, + "loss": 0.0013, + "num_tokens": 36370042.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 219.125, + "completions/mean_terminated_length": 219.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.8042796532005165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.02374658768530935, + "learning_rate": 1.493946221557863e-05, + "loss": 0.0009, + "num_tokens": 36375059.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 292.375, + "completions/mean_terminated_length": 292.375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8044641210108836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.05420694872736931, + "learning_rate": 1.4936662337658353e-05, + "loss": 0.0022, + "num_tokens": 36383654.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 379.5, + "completions/mean_terminated_length": 379.5, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.8046485888212507, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.03056553890928626, + "learning_rate": 1.4933861947937754e-05, + "loss": 0.0012, + "num_tokens": 36393146.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 427.25, + "completions/mean_terminated_length": 427.25, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.8048330566316177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.04093357943929732, + "learning_rate": 1.4931061046707159e-05, + "loss": 0.0016, + "num_tokens": 36403284.0, + "reward": 1.528735637664795, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5287356376647949, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 348.375, + "completions/mean_terminated_length": 348.375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.8050175244419848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.96875, + "kl": 0.13578288117423654, + "learning_rate": 1.4928259634256943e-05, + "loss": 0.0054, + "num_tokens": 36410359.0, + "reward": 1.6414835453033447, + "reward_std": 0.7222461700439453, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7664835453033447, + "rewards/fixed_code_pass_all_test_reward/std": 0.4214785695075989, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 147.0, + "completions/max_terminated_length": 147.0, + "completions/mean_length": 121.625, + "completions/mean_terminated_length": 121.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.805201992252352, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.052997968159615993, + "learning_rate": 1.4925457710877545e-05, + "loss": 0.0021, + "num_tokens": 36414148.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 305.125, + "completions/mean_terminated_length": 305.125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.8053864600627191, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.078125, + "kl": 0.06247329921461642, + "learning_rate": 1.4922655276859446e-05, + "loss": 0.0025, + "num_tokens": 36422069.0, + "reward": 1.625, + "reward_std": 0.4432026147842407, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 178.25, + "completions/mean_terminated_length": 178.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.8055709278730862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.03539038309827447, + "learning_rate": 1.4919852332493183e-05, + "loss": 0.0014, + "num_tokens": 36426359.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 362.75, + "completions/mean_terminated_length": 362.75, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8057553956834532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.024879404110834002, + "learning_rate": 1.4917048878069348e-05, + "loss": 0.001, + "num_tokens": 36436557.0, + "reward": 1.7048193216323853, + "reward_std": 0.12421075254678726, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7048192620277405, + "rewards/fixed_code_pass_all_test_reward/std": 0.12421079725027084, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 425.5, + "completions/mean_terminated_length": 425.5, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8059398634938203, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.06623038137331605, + "learning_rate": 1.4914244913878584e-05, + "loss": 0.0026, + "num_tokens": 36444113.0, + "reward": 1.4166666269302368, + "reward_std": 0.45730987191200256, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666567325592, + "rewards/fixed_code_pass_all_test_reward/std": 0.45730993151664734, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 788.125, + "completions/mean_terminated_length": 788.125, + "completions/min_length": 723.0, + "completions/min_terminated_length": 723.0, + "epoch": 0.8061243313041874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76171875, + "kl": 0.026985066826455295, + "learning_rate": 1.4911440440211592e-05, + "loss": 0.0011, + "num_tokens": 36457578.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 235.0, + "completions/mean_terminated_length": 235.0, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.8063087991145546, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11474609375, + "kl": 0.049860212951898575, + "learning_rate": 1.4908635457359112e-05, + "loss": 0.002, + "num_tokens": 36463418.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 332.5, + "completions/mean_terminated_length": 332.5, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.8064932669249216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033203125, + "kl": 0.018359996378421783, + "learning_rate": 1.490582996561195e-05, + "loss": 0.0007, + "num_tokens": 36471894.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 236.375, + "completions/mean_terminated_length": 236.375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.8066777347352887, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03707440476864576, + "learning_rate": 1.4903023965260963e-05, + "loss": 0.0015, + "num_tokens": 36477113.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 195.375, + "completions/mean_terminated_length": 195.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.8068622025456558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07470703125, + "kl": 0.029892700724303722, + "learning_rate": 1.4900217456597059e-05, + "loss": 0.0012, + "num_tokens": 36481524.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 327.375, + "completions/mean_terminated_length": 327.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.8070466703560228, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.05107171507552266, + "learning_rate": 1.4897410439911192e-05, + "loss": 0.002, + "num_tokens": 36491487.0, + "reward": 1.8785715103149414, + "reward_std": 0.3434518277645111, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8785714507102966, + "rewards/fixed_code_pass_all_test_reward/std": 0.3434518575668335, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 656.0, + "completions/mean_terminated_length": 656.0, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "epoch": 0.8072311381663899, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76953125, + "kl": 0.03920824360102415, + "learning_rate": 1.4894602915494382e-05, + "loss": 0.0016, + "num_tokens": 36503759.0, + "reward": 1.1416666507720947, + "reward_std": 0.1433720737695694, + "rewards/fixed_code_pass_all_test_reward/mean": 0.14166668057441711, + "rewards/fixed_code_pass_all_test_reward/std": 0.14337210357189178, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 262.875, + "completions/mean_terminated_length": 262.875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.8074156059767571, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.054348713252693415, + "learning_rate": 1.4891794883637692e-05, + "loss": 0.0022, + "num_tokens": 36512334.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 445.0, + "completions/mean_terminated_length": 445.0, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.8076000737871242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.025964933389332145, + "learning_rate": 1.4888986344632239e-05, + "loss": 0.001, + "num_tokens": 36521030.0, + "reward": 1.9642857313156128, + "reward_std": 0.10101523250341415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 144.125, + "completions/mean_terminated_length": 144.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.8077845415974912, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.09122531954199076, + "learning_rate": 1.4886177298769192e-05, + "loss": 0.0036, + "num_tokens": 36524919.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 375.875, + "completions/mean_terminated_length": 375.875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8079690094078583, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.05263061635196209, + "learning_rate": 1.4883367746339778e-05, + "loss": 0.0021, + "num_tokens": 36532342.0, + "reward": 1.9076087474822998, + "reward_std": 0.13030529022216797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9076087474822998, + "rewards/fixed_code_pass_all_test_reward/std": 0.13030532002449036, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 312.5, + "completions/mean_terminated_length": 312.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8081534772182254, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.031326914206147194, + "learning_rate": 1.4880557687635269e-05, + "loss": 0.0013, + "num_tokens": 36538970.0, + "reward": 1.9038461446762085, + "reward_std": 0.2719641625881195, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9038461446762085, + "rewards/fixed_code_pass_all_test_reward/std": 0.2719641625881195, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 234.375, + "completions/mean_terminated_length": 234.375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.8083379450285925, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.0400226708734408, + "learning_rate": 1.4877747122947e-05, + "loss": 0.0016, + "num_tokens": 36543813.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 233.5, + "completions/mean_terminated_length": 233.5, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.8085224128389596, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.047981366980820894, + "learning_rate": 1.487493605256634e-05, + "loss": 0.0019, + "num_tokens": 36548865.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 246.75, + "completions/mean_terminated_length": 246.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8087068806493267, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.04247316438704729, + "learning_rate": 1.4872124476784734e-05, + "loss": 0.0017, + "num_tokens": 36556607.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 292.125, + "completions/mean_terminated_length": 292.125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.8088913484596938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.06693923333659768, + "learning_rate": 1.486931239589366e-05, + "loss": 0.0027, + "num_tokens": 36565520.0, + "reward": 1.1111111640930176, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1111111119389534, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.8090758162700609, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.02382931101601571, + "learning_rate": 1.4866499810184662e-05, + "loss": 0.001, + "num_tokens": 36569705.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 350.625, + "completions/mean_terminated_length": 350.625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.8092602840804279, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045166015625, + "kl": 0.045349675696343184, + "learning_rate": 1.4863686719949321e-05, + "loss": 0.0018, + "num_tokens": 36576966.0, + "reward": 1.5384615659713745, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5384615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 367.0, + "completions/mean_terminated_length": 367.0, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.809444751890795, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12353515625, + "kl": 0.04114903451409191, + "learning_rate": 1.486087312547929e-05, + "loss": 0.0016, + "num_tokens": 36586294.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 415.25, + "completions/mean_terminated_length": 415.25, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.8096292197011622, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9140625, + "kl": 0.023470085579901934, + "learning_rate": 1.4858059027066256e-05, + "loss": 0.0009, + "num_tokens": 36595032.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 435.125, + "completions/mean_terminated_length": 435.125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.8098136875115293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.0530205462127924, + "learning_rate": 1.485524442500197e-05, + "loss": 0.0021, + "num_tokens": 36607097.0, + "reward": 1.5333333015441895, + "reward_std": 0.41975051164627075, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5333333015441895, + "rewards/fixed_code_pass_all_test_reward/std": 0.41975051164627075, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 338.0, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.8099981553218963, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.07789242453873158, + "learning_rate": 1.485242931957823e-05, + "loss": 0.0031, + "num_tokens": 36618089.0, + "reward": 1.7083333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.11785111576318741, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 381.625, + "completions/mean_terminated_length": 381.625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.8101826231322634, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.05858763330616057, + "learning_rate": 1.4849613711086885e-05, + "loss": 0.0023, + "num_tokens": 36625870.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.8103670909426305, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.016684241592884064, + "learning_rate": 1.4846797599819844e-05, + "loss": 0.0007, + "num_tokens": 36631005.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 628.0, + "completions/mean_terminated_length": 628.0, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.8105515587529976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6875, + "kl": 0.03264805069193244, + "learning_rate": 1.4843980986069058e-05, + "loss": 0.0013, + "num_tokens": 36645437.0, + "reward": 1.894230842590332, + "reward_std": 0.29916054010391235, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8942307829856873, + "rewards/fixed_code_pass_all_test_reward/std": 0.29916056990623474, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 389.125, + "completions/mean_terminated_length": 389.125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8107360265633647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033203125, + "kl": 0.015926459687761962, + "learning_rate": 1.4841163870126533e-05, + "loss": 0.0006, + "num_tokens": 36652990.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 367.25, + "completions/mean_terminated_length": 367.25, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.8109204943737318, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.03767395415343344, + "learning_rate": 1.4838346252284337e-05, + "loss": 0.0015, + "num_tokens": 36660320.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 487.875, + "completions/mean_terminated_length": 487.875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.8111049621840989, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.024379739305004478, + "learning_rate": 1.4835528132834579e-05, + "loss": 0.001, + "num_tokens": 36671607.0, + "reward": 1.2467105388641357, + "reward_std": 0.25120896100997925, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24671052396297455, + "rewards/fixed_code_pass_all_test_reward/std": 0.25120899081230164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 241.25, + "completions/mean_terminated_length": 241.25, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.811289429994466, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.859375, + "kl": 0.06366124947089702, + "learning_rate": 1.4832709512069414e-05, + "loss": 0.0025, + "num_tokens": 36679961.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 503.25, + "completions/mean_terminated_length": 503.25, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.811473897804833, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.038279904052615166, + "learning_rate": 1.482989039028107e-05, + "loss": 0.0015, + "num_tokens": 36690275.0, + "reward": 1.6666667461395264, + "reward_std": 0.17817415297031403, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.17817415297031403, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 526.875, + "completions/mean_terminated_length": 526.875, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.8116583656152001, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.03175914043094963, + "learning_rate": 1.4827070767761806e-05, + "loss": 0.0013, + "num_tokens": 36703306.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.8118428334255673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2177734375, + "kl": 0.026326433988288045, + "learning_rate": 1.4824250644803951e-05, + "loss": 0.0011, + "num_tokens": 36712399.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 410.0, + "completions/mean_terminated_length": 410.0, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.8120273012359344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.03551439684815705, + "learning_rate": 1.4821430021699865e-05, + "loss": 0.0014, + "num_tokens": 36724071.0, + "reward": 1.8858695030212402, + "reward_std": 0.3228096067905426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.885869562625885, + "rewards/fixed_code_pass_all_test_reward/std": 0.322809636592865, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 572.5, + "completions/mean_terminated_length": 361.71429443359375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.8122117690463014, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.03990285762120038, + "learning_rate": 1.4818608898741982e-05, + "loss": 0.0016, + "num_tokens": 36733731.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 585.625, + "completions/mean_terminated_length": 585.625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.8123962368566685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.055546588730067015, + "learning_rate": 1.4815787276222768e-05, + "loss": 0.0022, + "num_tokens": 36749432.0, + "reward": 1.6796875, + "reward_std": 0.7066627144813538, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8046875, + "rewards/fixed_code_pass_all_test_reward/std": 0.380080908536911, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 260.125, + "completions/mean_terminated_length": 260.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.8125807046670356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.04892460582777858, + "learning_rate": 1.481296515443476e-05, + "loss": 0.002, + "num_tokens": 36755513.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 265.875, + "completions/mean_terminated_length": 265.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.8127651724774027, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.06480020051822066, + "learning_rate": 1.4810142533670526e-05, + "loss": 0.0026, + "num_tokens": 36760880.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 423.5, + "completions/mean_terminated_length": 423.5, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.8129496402877698, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.062073503620922565, + "learning_rate": 1.4807319414222708e-05, + "loss": 0.0025, + "num_tokens": 36771420.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 386.875, + "completions/mean_terminated_length": 386.875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.8131341080981369, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.11369953537359834, + "learning_rate": 1.4804495796383977e-05, + "loss": 0.0045, + "num_tokens": 36782267.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 169.125, + "completions/mean_terminated_length": 169.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.813318575908504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.0514680533669889, + "learning_rate": 1.4801671680447079e-05, + "loss": 0.0021, + "num_tokens": 36786428.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 396.5, + "completions/mean_terminated_length": 396.5, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.813503043718871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.087890625, + "kl": 0.04786990396678448, + "learning_rate": 1.4798847066704785e-05, + "loss": 0.0019, + "num_tokens": 36797000.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 289.75, + "completions/mean_terminated_length": 289.75, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8136875115292381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06787109375, + "kl": 0.04091847641393542, + "learning_rate": 1.4796021955449943e-05, + "loss": 0.0016, + "num_tokens": 36805182.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 274.75, + "completions/mean_terminated_length": 274.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8138719793396052, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.04209016659297049, + "learning_rate": 1.479319634697544e-05, + "loss": 0.0017, + "num_tokens": 36814164.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 159.25, + "completions/mean_terminated_length": 159.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8140564471499724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.03043151763267815, + "learning_rate": 1.4790370241574214e-05, + "loss": 0.0012, + "num_tokens": 36818150.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 283.75, + "completions/mean_terminated_length": 283.75, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8142409149603395, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1171875, + "kl": 0.04518334660679102, + "learning_rate": 1.4787543639539257e-05, + "loss": 0.0018, + "num_tokens": 36826428.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 236.125, + "completions/mean_terminated_length": 236.125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.8144253827707065, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048095703125, + "kl": 0.0336305710952729, + "learning_rate": 1.4784716541163615e-05, + "loss": 0.0013, + "num_tokens": 36834669.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 291.375, + "completions/mean_terminated_length": 291.375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.8146098505810736, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.039790721610188484, + "learning_rate": 1.478188894674038e-05, + "loss": 0.0016, + "num_tokens": 36843576.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 300.125, + "completions/mean_terminated_length": 300.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.8147943183914407, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0753085152246058, + "learning_rate": 1.47790608565627e-05, + "loss": 0.003, + "num_tokens": 36850193.0, + "reward": 1.8434065580368042, + "reward_std": 0.2161194235086441, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8434065580368042, + "rewards/fixed_code_pass_all_test_reward/std": 0.2161194235086441, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 465.375, + "completions/mean_terminated_length": 465.375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.8149787862018077, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.81640625, + "kl": 0.033116265665739775, + "learning_rate": 1.4776232270923771e-05, + "loss": 0.0013, + "num_tokens": 36861916.0, + "reward": 1.8494318723678589, + "reward_std": 0.24566414952278137, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8494318723678589, + "rewards/fixed_code_pass_all_test_reward/std": 0.24566416442394257, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 411.0, + "completions/mean_terminated_length": 411.0, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8151632540121748, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11767578125, + "kl": 0.04929734254255891, + "learning_rate": 1.4773403190116845e-05, + "loss": 0.002, + "num_tokens": 36874172.0, + "reward": 1.454545497894287, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4545454680919647, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 340.125, + "completions/mean_terminated_length": 340.125, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.815347721822542, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.51171875, + "kl": 0.07296629901975393, + "learning_rate": 1.4770573614435218e-05, + "loss": 0.0029, + "num_tokens": 36881533.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 529.75, + "completions/mean_terminated_length": 529.75, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.8155321896329091, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04199667274951935, + "learning_rate": 1.476774354417224e-05, + "loss": 0.0017, + "num_tokens": 36891859.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 361.0, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.8157166574432761, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.029323241906240582, + "learning_rate": 1.4764912979621321e-05, + "loss": 0.0012, + "num_tokens": 36899139.0, + "reward": 1.8382352590560913, + "reward_std": 0.174357071518898, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8382353186607361, + "rewards/fixed_code_pass_all_test_reward/std": 0.1743570864200592, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 338.0, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8159011252536432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.02728944446425885, + "learning_rate": 1.4762081921075912e-05, + "loss": 0.0011, + "num_tokens": 36906275.0, + "reward": 1.9886363744735718, + "reward_std": 0.03214118629693985, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9886363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.03214120864868164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 615.0, + "completions/max_terminated_length": 615.0, + "completions/mean_length": 373.75, + "completions/mean_terminated_length": 373.75, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.8160855930640103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.05139522533863783, + "learning_rate": 1.4759250368829519e-05, + "loss": 0.0021, + "num_tokens": 36915193.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.8162700608743774, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.078125, + "kl": 0.029186043422669172, + "learning_rate": 1.4756418323175691e-05, + "loss": 0.0012, + "num_tokens": 36920511.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.8164545286847446, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043701171875, + "kl": 0.031119376653805375, + "learning_rate": 1.4753585784408049e-05, + "loss": 0.0012, + "num_tokens": 36930364.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 420.875, + "completions/mean_terminated_length": 420.875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.8166389964951116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9296875, + "kl": 0.040793149964883924, + "learning_rate": 1.475075275282024e-05, + "loss": 0.0016, + "num_tokens": 36938603.0, + "reward": 1.25, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 357.25, + "completions/mean_terminated_length": 357.25, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.8168234643054787, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.04774754913523793, + "learning_rate": 1.4747919228705982e-05, + "loss": 0.0019, + "num_tokens": 36949549.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 484.0, + "completions/mean_terminated_length": 484.0, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.8170079321158458, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.027475686511024833, + "learning_rate": 1.474508521235903e-05, + "loss": 0.0011, + "num_tokens": 36961421.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.8171923999262128, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4375, + "kl": 0.060472844168543816, + "learning_rate": 1.4742250704073199e-05, + "loss": 0.0024, + "num_tokens": 36969468.0, + "reward": 1.8017241954803467, + "reward_std": 0.3834303915500641, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8017241358757019, + "rewards/fixed_code_pass_all_test_reward/std": 0.3834303915500641, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 304.125, + "completions/mean_terminated_length": 304.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8173768677365799, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.84375, + "kl": 0.26671713683754206, + "learning_rate": 1.4739415704142352e-05, + "loss": 0.0107, + "num_tokens": 36979213.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 167.875, + "completions/mean_terminated_length": 167.875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.8175613355469471, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8046875, + "kl": 0.06681014341302216, + "learning_rate": 1.4736580212860405e-05, + "loss": 0.0027, + "num_tokens": 36983604.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 313.625, + "completions/mean_terminated_length": 313.625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8177458033573142, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.0629424867220223, + "learning_rate": 1.4733744230521314e-05, + "loss": 0.0025, + "num_tokens": 36990185.0, + "reward": 1.1304347515106201, + "reward_std": 0.1859208345413208, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1304347813129425, + "rewards/fixed_code_pass_all_test_reward/std": 0.1859208643436432, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8179302711676812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.04263966716825962, + "learning_rate": 1.4730907757419108e-05, + "loss": 0.0017, + "num_tokens": 36998510.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 404.625, + "completions/mean_terminated_length": 404.625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.8181147389780483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.02421852620318532, + "learning_rate": 1.4728070793847842e-05, + "loss": 0.001, + "num_tokens": 37006195.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 225.875, + "completions/mean_terminated_length": 225.875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.8182992067884154, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.0183974995452445, + "learning_rate": 1.4725233340101641e-05, + "loss": 0.0007, + "num_tokens": 37011738.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 340.0, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8184836745987825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.111328125, + "kl": 0.05790543183684349, + "learning_rate": 1.4722395396474668e-05, + "loss": 0.0023, + "num_tokens": 37018754.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 329.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.8186681424091496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.921875, + "kl": 0.04577395226806402, + "learning_rate": 1.4719556963261148e-05, + "loss": 0.0018, + "num_tokens": 37026687.0, + "reward": 1.6875, + "reward_std": 0.2587745785713196, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.25877460837364197, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 342.875, + "completions/mean_terminated_length": 342.875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.8188526102195167, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.06463017547503114, + "learning_rate": 1.4716718040755346e-05, + "loss": 0.0026, + "num_tokens": 37036982.0, + "reward": 1.466397762298584, + "reward_std": 0.3072623610496521, + "rewards/fixed_code_pass_all_test_reward/mean": 0.46639782190322876, + "rewards/fixed_code_pass_all_test_reward/std": 0.3072623908519745, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 453.125, + "completions/mean_terminated_length": 453.125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.8190370780298838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.015625, + "kl": 0.05264505883678794, + "learning_rate": 1.4713878629251584e-05, + "loss": 0.0021, + "num_tokens": 37045287.0, + "reward": 1.7259615659713745, + "reward_std": 0.45295941829681396, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7259615659713745, + "rewards/fixed_code_pass_all_test_reward/std": 0.45295944809913635, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8192215458402509, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.75, + "kl": 0.048185139428824186, + "learning_rate": 1.4711038729044233e-05, + "loss": 0.0019, + "num_tokens": 37049804.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8194060136506179, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07275390625, + "kl": 0.052363231778144836, + "learning_rate": 1.4708198340427719e-05, + "loss": 0.0021, + "num_tokens": 37055841.0, + "reward": 1.3636363744735718, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3636363744735718, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 318.625, + "completions/mean_terminated_length": 318.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.819590481460985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.032014787779189646, + "learning_rate": 1.4705357463696509e-05, + "loss": 0.0013, + "num_tokens": 37064006.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 269.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.8197749492713522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.023892402183264494, + "learning_rate": 1.4702516099145126e-05, + "loss": 0.001, + "num_tokens": 37072773.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.8199594170817193, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7890625, + "kl": 0.05666707525961101, + "learning_rate": 1.4699674247068147e-05, + "loss": 0.0023, + "num_tokens": 37081394.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 261.25, + "completions/mean_terminated_length": 261.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.8201438848920863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.05196774681098759, + "learning_rate": 1.4696831907760198e-05, + "loss": 0.0021, + "num_tokens": 37087676.0, + "reward": 1.7737069129943848, + "reward_std": 0.1458437144756317, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7737069129943848, + "rewards/fixed_code_pass_all_test_reward/std": 0.14584369957447052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 369.25, + "completions/mean_terminated_length": 369.25, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.8203283527024534, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.028647802071645856, + "learning_rate": 1.469398908151595e-05, + "loss": 0.0011, + "num_tokens": 37094806.0, + "reward": 1.4166667461395264, + "reward_std": 0.235702246427536, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 165.5, + "completions/mean_terminated_length": 165.5, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8205128205128205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17578125, + "kl": 0.07041107444092631, + "learning_rate": 1.4691145768630128e-05, + "loss": 0.0028, + "num_tokens": 37098970.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 298.5, + "completions/mean_terminated_length": 298.5, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8206972883231876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.05504255369305611, + "learning_rate": 1.4688301969397511e-05, + "loss": 0.0022, + "num_tokens": 37105614.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 298.875, + "completions/mean_terminated_length": 298.875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.8208817561335547, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.05268324865028262, + "learning_rate": 1.4685457684112925e-05, + "loss": 0.0021, + "num_tokens": 37115061.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 314.75, + "completions/mean_terminated_length": 314.75, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.8210662239439218, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.546875, + "kl": 0.2285073499660939, + "learning_rate": 1.4682612913071244e-05, + "loss": 0.0091, + "num_tokens": 37122803.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.8212506917542889, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0284423828125, + "kl": 0.02116668305825442, + "learning_rate": 1.4679767656567392e-05, + "loss": 0.0008, + "num_tokens": 37130103.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 504.125, + "completions/mean_terminated_length": 504.125, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.821435159564656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.03326649544760585, + "learning_rate": 1.4676921914896355e-05, + "loss": 0.0013, + "num_tokens": 37139632.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 198.875, + "completions/mean_terminated_length": 198.875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.821619627375023, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.05363059765659273, + "learning_rate": 1.4674075688353155e-05, + "loss": 0.0021, + "num_tokens": 37144271.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 328.625, + "completions/mean_terminated_length": 328.625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.8218040951853901, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.04232618445530534, + "learning_rate": 1.4671228977232871e-05, + "loss": 0.0017, + "num_tokens": 37151348.0, + "reward": 1.8010203838348389, + "reward_std": 0.14170719683170319, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8010203838348389, + "rewards/fixed_code_pass_all_test_reward/std": 0.1417071521282196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 223.375, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8219885629957573, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.03401131136342883, + "learning_rate": 1.466838178183063e-05, + "loss": 0.0014, + "num_tokens": 37156471.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 501.75, + "completions/mean_terminated_length": 501.75, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.8221730308061244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.796875, + "kl": 0.03302102710586041, + "learning_rate": 1.4665534102441611e-05, + "loss": 0.0013, + "num_tokens": 37168013.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 520.375, + "completions/mean_terminated_length": 520.375, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.8223574986164914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.020508441841229796, + "learning_rate": 1.4662685939361043e-05, + "loss": 0.0008, + "num_tokens": 37180696.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 254.625, + "completions/mean_terminated_length": 254.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8225419664268585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.025889663957059383, + "learning_rate": 1.4659837292884204e-05, + "loss": 0.001, + "num_tokens": 37186829.0, + "reward": 1.8793103694915771, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8793103694915771, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 257.375, + "completions/mean_terminated_length": 257.375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.8227264342372256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.026947322534397244, + "learning_rate": 1.4656988163306422e-05, + "loss": 0.0011, + "num_tokens": 37192672.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 323.625, + "completions/mean_terminated_length": 323.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8229109020475927, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.06098004523664713, + "learning_rate": 1.465413855092308e-05, + "loss": 0.0024, + "num_tokens": 37202269.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 364.0, + "completions/mean_terminated_length": 364.0, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.8230953698579598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.91015625, + "kl": 0.018815435119904578, + "learning_rate": 1.4651288456029602e-05, + "loss": 0.0008, + "num_tokens": 37209869.0, + "reward": 1.93478262424469, + "reward_std": 0.040253035724163055, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9347826242446899, + "rewards/fixed_code_pass_all_test_reward/std": 0.040253039449453354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 415.375, + "completions/mean_terminated_length": 415.375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.8232798376683269, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03125, + "kl": 0.02236573596019298, + "learning_rate": 1.4648437878921466e-05, + "loss": 0.0009, + "num_tokens": 37218288.0, + "reward": 1.476190447807312, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4761904776096344, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 178.875, + "completions/mean_terminated_length": 178.875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.823464305478694, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.02983657654840499, + "learning_rate": 1.4645586819894204e-05, + "loss": 0.0012, + "num_tokens": 37222719.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.823648773289061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.024180538137443364, + "learning_rate": 1.4642735279243398e-05, + "loss": 0.001, + "num_tokens": 37228740.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 394.875, + "completions/mean_terminated_length": 394.875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.8238332410994281, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10205078125, + "kl": 0.05122938007116318, + "learning_rate": 1.4639883257264669e-05, + "loss": 0.002, + "num_tokens": 37237939.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 275.5, + "completions/mean_terminated_length": 275.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.8240177089097952, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.031123207532800734, + "learning_rate": 1.4637030754253703e-05, + "loss": 0.0012, + "num_tokens": 37243967.0, + "reward": 1.9874999523162842, + "reward_std": 0.035355329513549805, + "rewards/fixed_code_pass_all_test_reward/mean": 0.987500011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.0353553481400013, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 398.375, + "completions/mean_terminated_length": 398.375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.8242021767201624, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.25, + "kl": 0.0388511479832232, + "learning_rate": 1.463417777050622e-05, + "loss": 0.0016, + "num_tokens": 37254458.0, + "reward": 1.7213854789733887, + "reward_std": 0.05882582813501358, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7213855385780334, + "rewards/fixed_code_pass_all_test_reward/std": 0.05882588401436806, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 266.625, + "completions/mean_terminated_length": 266.625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.8243866445305295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061767578125, + "kl": 0.040384803898632526, + "learning_rate": 1.4631324306318009e-05, + "loss": 0.0016, + "num_tokens": 37260095.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 223.5, + "completions/mean_terminated_length": 223.5, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.8245711123408965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053466796875, + "kl": 0.025733416783623397, + "learning_rate": 1.462847036198489e-05, + "loss": 0.001, + "num_tokens": 37265827.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 237.25, + "completions/mean_terminated_length": 237.25, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.8247555801512636, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.0459861836861819, + "learning_rate": 1.4625615937802745e-05, + "loss": 0.0018, + "num_tokens": 37271613.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 324.625, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.8249400479616307, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.051257923943921924, + "learning_rate": 1.4622761034067499e-05, + "loss": 0.0021, + "num_tokens": 37278610.0, + "reward": 1.5714285373687744, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285969734192, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 306.875, + "completions/mean_terminated_length": 306.875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8251245157719977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056396484375, + "kl": 0.04899001447483897, + "learning_rate": 1.4619905651075132e-05, + "loss": 0.002, + "num_tokens": 37287809.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 212.875, + "completions/mean_terminated_length": 212.875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.8253089835823649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87890625, + "kl": 0.02402504440397024, + "learning_rate": 1.461704978912167e-05, + "loss": 0.001, + "num_tokens": 37292688.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 315.625, + "completions/mean_terminated_length": 315.625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.825493451392732, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.04842424509115517, + "learning_rate": 1.4614193448503189e-05, + "loss": 0.0019, + "num_tokens": 37300957.0, + "reward": 1.9375, + "reward_std": 0.1767766922712326, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9375, + "rewards/fixed_code_pass_all_test_reward/std": 0.1767766922712326, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 267.125, + "completions/mean_terminated_length": 267.125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8256779192030991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2431640625, + "kl": 0.08185578417032957, + "learning_rate": 1.4611336629515818e-05, + "loss": 0.0033, + "num_tokens": 37307206.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 162.875, + "completions/mean_terminated_length": 162.875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.8258623870134661, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0966796875, + "kl": 0.02131652587559074, + "learning_rate": 1.4608479332455729e-05, + "loss": 0.0009, + "num_tokens": 37311381.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 277.625, + "completions/mean_terminated_length": 277.625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.8260468548238332, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0390625, + "kl": 0.020436199265532196, + "learning_rate": 1.460562155761915e-05, + "loss": 0.0008, + "num_tokens": 37321226.0, + "reward": 1.9276316165924072, + "reward_std": 0.13400031626224518, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9276315569877625, + "rewards/fixed_code_pass_all_test_reward/std": 0.1340002864599228, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 231.125, + "completions/mean_terminated_length": 231.125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.8262313226342003, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.050412192242220044, + "learning_rate": 1.4602763305302357e-05, + "loss": 0.002, + "num_tokens": 37331387.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 166.25, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.8264157904445675, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.06437540682964027, + "learning_rate": 1.4599904575801673e-05, + "loss": 0.0026, + "num_tokens": 37335589.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 448.375, + "completions/mean_terminated_length": 448.375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.8266002582549346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.03262524155434221, + "learning_rate": 1.4597045369413471e-05, + "loss": 0.0013, + "num_tokens": 37344272.0, + "reward": 1.6666667461395264, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8267847260653016, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.050574913155287504, + "learning_rate": 1.4594185686434176e-05, + "loss": 0.002, + "num_tokens": 37352250.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 371.625, + "completions/mean_terminated_length": 371.625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.8269691938756687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08349609375, + "kl": 0.03438901528716087, + "learning_rate": 1.4591325527160262e-05, + "loss": 0.0014, + "num_tokens": 37362359.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 187.125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.8271536616860358, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28515625, + "kl": 0.05558170052245259, + "learning_rate": 1.4588464891888252e-05, + "loss": 0.0022, + "num_tokens": 37369624.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 135.375, + "completions/mean_terminated_length": 135.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.8273381294964028, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10498046875, + "kl": 0.03641491453163326, + "learning_rate": 1.4585603780914714e-05, + "loss": 0.0015, + "num_tokens": 37373443.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 466.25, + "completions/mean_terminated_length": 466.25, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.8275225973067699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.035306291887536645, + "learning_rate": 1.458274219453627e-05, + "loss": 0.0014, + "num_tokens": 37384685.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 245.375, + "completions/mean_terminated_length": 245.375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8277070651171371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.025669215945526958, + "learning_rate": 1.457988013304959e-05, + "loss": 0.001, + "num_tokens": 37389776.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 100.625, + "completions/mean_terminated_length": 100.625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.8278915329275042, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.040187884122133255, + "learning_rate": 1.4577017596751399e-05, + "loss": 0.0016, + "num_tokens": 37393285.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 405.0, + "completions/mean_terminated_length": 405.0, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.8280760007378712, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8828125, + "kl": 0.0373152126558125, + "learning_rate": 1.4574154585938458e-05, + "loss": 0.0015, + "num_tokens": 37401221.0, + "reward": 1.8223683834075928, + "reward_std": 0.24515488743782043, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8223684430122375, + "rewards/fixed_code_pass_all_test_reward/std": 0.24515485763549805, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 334.5, + "completions/mean_terminated_length": 334.5, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.8282604685482383, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.055354381911456585, + "learning_rate": 1.457129110090759e-05, + "loss": 0.0022, + "num_tokens": 37410785.0, + "reward": 1.703125, + "reward_std": 0.3199993073940277, + "rewards/fixed_code_pass_all_test_reward/mean": 0.703125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3199993073940277, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 293.25, + "completions/mean_terminated_length": 293.25, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.8284449363586054, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.04033209034241736, + "learning_rate": 1.4568427141955656e-05, + "loss": 0.0016, + "num_tokens": 37419099.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 344.375, + "completions/mean_terminated_length": 344.375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.8286294041689725, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.0659853364340961, + "learning_rate": 1.4565562709379581e-05, + "loss": 0.0026, + "num_tokens": 37426094.0, + "reward": 1.8263888359069824, + "reward_std": 0.32146528363227844, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8263888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.3214653432369232, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 287.75, + "completions/mean_terminated_length": 287.75, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.8288138719793396, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.060248988680541515, + "learning_rate": 1.4562697803476325e-05, + "loss": 0.0024, + "num_tokens": 37437484.0, + "reward": 1.2053570747375488, + "reward_std": 0.11729146540164948, + "rewards/fixed_code_pass_all_test_reward/mean": 0.205357164144516, + "rewards/fixed_code_pass_all_test_reward/std": 0.11729148030281067, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.8289983397897067, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.828125, + "kl": 0.05441493587568402, + "learning_rate": 1.4559832424542901e-05, + "loss": 0.0022, + "num_tokens": 37443550.0, + "reward": 1.225000023841858, + "reward_std": 0.2492847442626953, + "rewards/fixed_code_pass_all_test_reward/mean": 0.22500000894069672, + "rewards/fixed_code_pass_all_test_reward/std": 0.24928469955921173, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 358.375, + "completions/mean_terminated_length": 358.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.8291828076000738, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.04032497154548764, + "learning_rate": 1.4556966572876377e-05, + "loss": 0.0016, + "num_tokens": 37451073.0, + "reward": 1.0, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 230.5, + "completions/mean_terminated_length": 230.5, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.8293672754104409, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.023513261345215142, + "learning_rate": 1.4554100248773863e-05, + "loss": 0.0009, + "num_tokens": 37456085.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 270.125, + "completions/mean_terminated_length": 270.125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8295517432208079, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04222102393396199, + "learning_rate": 1.4551233452532518e-05, + "loss": 0.0017, + "num_tokens": 37462326.0, + "reward": 1.7702206373214722, + "reward_std": 0.10918562859296799, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7702206373214722, + "rewards/fixed_code_pass_all_test_reward/std": 0.1091856062412262, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 325.5, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.829736211031175, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0419921875, + "kl": 0.022089880891144276, + "learning_rate": 1.4548366184449556e-05, + "loss": 0.0009, + "num_tokens": 37469354.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 149.5, + "completions/mean_terminated_length": 149.5, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.8299206788415422, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.73828125, + "kl": 0.09100817097350955, + "learning_rate": 1.4545498444822237e-05, + "loss": 0.0036, + "num_tokens": 37473766.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 330.25, + "completions/mean_terminated_length": 330.25, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8301051466519093, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047119140625, + "kl": 0.022257568780332804, + "learning_rate": 1.4542630233947868e-05, + "loss": 0.0009, + "num_tokens": 37480872.0, + "reward": 1.375, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8302896144622763, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.859375, + "kl": 0.0736381453461945, + "learning_rate": 1.4539761552123803e-05, + "loss": 0.0029, + "num_tokens": 37484763.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 176.25, + "completions/mean_terminated_length": 176.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8304740822726434, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.0625, + "kl": 0.04495581751689315, + "learning_rate": 1.4536892399647449e-05, + "loss": 0.0018, + "num_tokens": 37488909.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 322.25, + "completions/mean_terminated_length": 322.25, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.8306585500830105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.048828182741999626, + "learning_rate": 1.4534022776816264e-05, + "loss": 0.002, + "num_tokens": 37499287.0, + "reward": 1.8954546451568604, + "reward_std": 0.03856949508190155, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8954545259475708, + "rewards/fixed_code_pass_all_test_reward/std": 0.03856946527957916, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 315.625, + "completions/mean_terminated_length": 315.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.8308430178933776, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.033572531305253506, + "learning_rate": 1.4531152683927749e-05, + "loss": 0.0013, + "num_tokens": 37506220.0, + "reward": 1.960714340209961, + "reward_std": 0.08215394616127014, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9607142806053162, + "rewards/fixed_code_pass_all_test_reward/std": 0.08215394616127014, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.8310274857037447, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.515625, + "kl": 0.09441158547997475, + "learning_rate": 1.4528282121279455e-05, + "loss": 0.0038, + "num_tokens": 37515459.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 169.625, + "completions/mean_terminated_length": 169.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.8312119535141118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.205078125, + "kl": 0.08699578884989023, + "learning_rate": 1.4525411089168986e-05, + "loss": 0.0035, + "num_tokens": 37522272.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 185.125, + "completions/mean_terminated_length": 185.125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.8313964213244789, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.173828125, + "kl": 0.04309878475032747, + "learning_rate": 1.4522539587893988e-05, + "loss": 0.0017, + "num_tokens": 37526673.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 217.0, + "completions/mean_terminated_length": 217.0, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.831580889134846, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05224609375, + "kl": 0.04440365731716156, + "learning_rate": 1.4519667617752164e-05, + "loss": 0.0018, + "num_tokens": 37532961.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 359.625, + "completions/mean_terminated_length": 359.625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.831765356945213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12158203125, + "kl": 0.05035535246133804, + "learning_rate": 1.4516795179041255e-05, + "loss": 0.002, + "num_tokens": 37543174.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 466.625, + "completions/mean_terminated_length": 466.625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.8319498247555801, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.03411020780913532, + "learning_rate": 1.451392227205906e-05, + "loss": 0.0014, + "num_tokens": 37552275.0, + "reward": 1.723557710647583, + "reward_std": 0.3541412949562073, + "rewards/fixed_code_pass_all_test_reward/mean": 0.723557710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.35414132475852966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 259.0, + "completions/mean_terminated_length": 259.0, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.8321342925659473, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.02259766194038093, + "learning_rate": 1.4511048897103423e-05, + "loss": 0.0009, + "num_tokens": 37557771.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 208.0, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.8323187603763144, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.05405281111598015, + "learning_rate": 1.4508175054472233e-05, + "loss": 0.0022, + "num_tokens": 37563443.0, + "reward": 1.965517282485962, + "reward_std": 0.0975319966673851, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9655172228813171, + "rewards/fixed_code_pass_all_test_reward/std": 0.09753198176622391, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 364.625, + "completions/mean_terminated_length": 364.625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.8325032281866814, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.546875, + "kl": 0.08603674033656716, + "learning_rate": 1.4505300744463435e-05, + "loss": 0.0034, + "num_tokens": 37574208.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 176.125, + "completions/mean_terminated_length": 176.125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8326876959970485, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.025676840625237674, + "learning_rate": 1.4502425967375016e-05, + "loss": 0.001, + "num_tokens": 37581985.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 379.5, + "completions/mean_terminated_length": 379.5, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.8328721638074156, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.01952862663893029, + "learning_rate": 1.4499550723505014e-05, + "loss": 0.0008, + "num_tokens": 37590389.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 310.75, + "completions/mean_terminated_length": 310.75, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8330566316177827, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.06135472655296326, + "learning_rate": 1.4496675013151516e-05, + "loss": 0.0025, + "num_tokens": 37600643.0, + "reward": 1.9583333730697632, + "reward_std": 0.11785109341144562, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9583333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.117851123213768, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 462.375, + "completions/mean_terminated_length": 462.375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.8332410994281498, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.048348306911066175, + "learning_rate": 1.4493798836612656e-05, + "loss": 0.0019, + "num_tokens": 37609910.0, + "reward": 1.8250000476837158, + "reward_std": 0.36154431104660034, + "rewards/fixed_code_pass_all_test_reward/mean": 0.824999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.36154431104660034, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8334255672385169, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3984375, + "kl": 0.05179552035406232, + "learning_rate": 1.4490922194186614e-05, + "loss": 0.0021, + "num_tokens": 37618261.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 258.875, + "completions/mean_terminated_length": 258.875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.833610035048884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.04271516017615795, + "learning_rate": 1.4488045086171629e-05, + "loss": 0.0017, + "num_tokens": 37623868.0, + "reward": 1.96875, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 621.25, + "completions/mean_terminated_length": 621.25, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.833794502859251, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6171875, + "kl": 0.023189717903733253, + "learning_rate": 1.4485167512865972e-05, + "loss": 0.0009, + "num_tokens": 37636926.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 514.25, + "completions/mean_terminated_length": 514.25, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.8339789706696181, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.03334027389064431, + "learning_rate": 1.4482289474567975e-05, + "loss": 0.0013, + "num_tokens": 37646352.0, + "reward": 1.866666555404663, + "reward_std": 0.13213752210140228, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8666666746139526, + "rewards/fixed_code_pass_all_test_reward/std": 0.1321374922990799, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 713.25, + "completions/mean_terminated_length": 713.25, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.8341634384799852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.65234375, + "kl": 0.0185983584378846, + "learning_rate": 1.4479410971576013e-05, + "loss": 0.0007, + "num_tokens": 37659242.0, + "reward": 1.5340908765792847, + "reward_std": 0.498965859413147, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.49896588921546936, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.8343479062903524, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.041237954050302505, + "learning_rate": 1.4476532004188512e-05, + "loss": 0.0016, + "num_tokens": 37669530.0, + "reward": 1.564814805984497, + "reward_std": 0.12001370638608932, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5648148059844971, + "rewards/fixed_code_pass_all_test_reward/std": 0.12001372873783112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 177.0, + "completions/mean_terminated_length": 177.0, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.8345323741007195, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.05169519118499011, + "learning_rate": 1.447365257270394e-05, + "loss": 0.0021, + "num_tokens": 37674394.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 229.625, + "completions/mean_terminated_length": 229.625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.8347168419110865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.064453125, + "kl": 0.040332772536203265, + "learning_rate": 1.447077267742082e-05, + "loss": 0.0016, + "num_tokens": 37681959.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 279.875, + "completions/mean_terminated_length": 279.875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8349013097214536, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.115234375, + "kl": 0.06222177390009165, + "learning_rate": 1.446789231863772e-05, + "loss": 0.0025, + "num_tokens": 37691022.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 299.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.8350857775318207, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.05941414460539818, + "learning_rate": 1.4465011496653259e-05, + "loss": 0.0024, + "num_tokens": 37697548.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 196.625, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.8352702453421877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09619140625, + "kl": 0.033833860885351896, + "learning_rate": 1.4462130211766094e-05, + "loss": 0.0014, + "num_tokens": 37703305.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 472.5, + "completions/mean_terminated_length": 472.5, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.8354547131525549, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.9453125, + "kl": 0.06556448247283697, + "learning_rate": 1.4459248464274944e-05, + "loss": 0.0026, + "num_tokens": 37712789.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 417.375, + "completions/mean_terminated_length": 417.375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.835639180962922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.033427080838009715, + "learning_rate": 1.4456366254478569e-05, + "loss": 0.0013, + "num_tokens": 37721176.0, + "reward": 1.8620131015777588, + "reward_std": 0.11426407098770142, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8620129823684692, + "rewards/fixed_code_pass_all_test_reward/std": 0.114264115691185, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 481.625, + "completions/mean_terminated_length": 481.625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8358236487732891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.014388896175660193, + "learning_rate": 1.4453483582675775e-05, + "loss": 0.0006, + "num_tokens": 37729397.0, + "reward": 1.6875, + "reward_std": 0.6373774409294128, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 335.5, + "completions/mean_terminated_length": 335.5, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.8360081165836561, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036865234375, + "kl": 0.018829017062671483, + "learning_rate": 1.4450600449165421e-05, + "loss": 0.0008, + "num_tokens": 37735873.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 207.0, + "completions/mean_terminated_length": 207.0, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.8361925843940232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03819097252562642, + "learning_rate": 1.4447716854246408e-05, + "loss": 0.0015, + "num_tokens": 37740401.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 350.125, + "completions/mean_terminated_length": 350.125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.8363770522043903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.76171875, + "kl": 0.023557094973511994, + "learning_rate": 1.444483279821769e-05, + "loss": 0.0009, + "num_tokens": 37747458.0, + "reward": 1.8472222089767456, + "reward_std": 0.25153848528862, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8472222089767456, + "rewards/fixed_code_pass_all_test_reward/std": 0.25153848528862, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 254.875, + "completions/mean_terminated_length": 254.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.8365615200147575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.03345900750719011, + "learning_rate": 1.4441948281378266e-05, + "loss": 0.0013, + "num_tokens": 37752433.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.8367459878251245, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0888671875, + "kl": 0.055046494118869305, + "learning_rate": 1.4439063304027183e-05, + "loss": 0.0022, + "num_tokens": 37763965.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 141.5, + "completions/mean_terminated_length": 141.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8369304556354916, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2119140625, + "kl": 0.08972206944599748, + "learning_rate": 1.4436177866463537e-05, + "loss": 0.0036, + "num_tokens": 37768001.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 273.125, + "completions/mean_terminated_length": 273.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.8371149234458587, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.05695320665836334, + "learning_rate": 1.4433291968986474e-05, + "loss": 0.0023, + "num_tokens": 37777186.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 186.375, + "completions/mean_terminated_length": 186.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.8372993912562258, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046875, + "kl": 0.034970608074218035, + "learning_rate": 1.4430405611895177e-05, + "loss": 0.0014, + "num_tokens": 37784045.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 367.375, + "completions/mean_terminated_length": 367.375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.8374838590665928, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.04002674319781363, + "learning_rate": 1.4427518795488891e-05, + "loss": 0.0016, + "num_tokens": 37791720.0, + "reward": 1.9186046123504639, + "reward_std": 0.09866604208946228, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9186046123504639, + "rewards/fixed_code_pass_all_test_reward/std": 0.09866604208946228, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 328.25, + "completions/mean_terminated_length": 328.25, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.83766832687696, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.04765365784987807, + "learning_rate": 1.4424631520066899e-05, + "loss": 0.0019, + "num_tokens": 37797298.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 164.375, + "completions/mean_terminated_length": 164.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8378527946873271, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1162109375, + "kl": 0.051682702731341124, + "learning_rate": 1.4421743785928536e-05, + "loss": 0.0021, + "num_tokens": 37803285.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 400.125, + "completions/mean_terminated_length": 400.125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.8380372624976942, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.02960884536150843, + "learning_rate": 1.4418855593373182e-05, + "loss": 0.0012, + "num_tokens": 37811630.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 291.25, + "completions/mean_terminated_length": 291.25, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.8382217303080612, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.59375, + "kl": 0.06615860527381301, + "learning_rate": 1.4415966942700266e-05, + "loss": 0.0026, + "num_tokens": 37820088.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8384061981184283, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.203125, + "kl": 0.040938820922747254, + "learning_rate": 1.4413077834209263e-05, + "loss": 0.0016, + "num_tokens": 37825431.0, + "reward": 1.899999976158142, + "reward_std": 0.2828426957130432, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.2828427255153656, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 421.375, + "completions/mean_terminated_length": 421.375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.8385906659287954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.034580805571749806, + "learning_rate": 1.4410188268199701e-05, + "loss": 0.0014, + "num_tokens": 37835954.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 207.0, + "completions/mean_terminated_length": 207.0, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.8387751337391626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11083984375, + "kl": 0.1285075508058071, + "learning_rate": 1.4407298244971144e-05, + "loss": 0.0051, + "num_tokens": 37840682.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 289.125, + "completions/mean_terminated_length": 289.125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.8389596015495296, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.022204914945177734, + "learning_rate": 1.4404407764823216e-05, + "loss": 0.0009, + "num_tokens": 37847211.0, + "reward": 1.379807710647583, + "reward_std": 0.421042799949646, + "rewards/fixed_code_pass_all_test_reward/mean": 0.379807710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.4210428297519684, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 357.75, + "completions/mean_terminated_length": 357.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8391440693598967, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.162109375, + "kl": 0.06957817077636719, + "learning_rate": 1.4401516828055579e-05, + "loss": 0.0028, + "num_tokens": 37856585.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 319.625, + "completions/mean_terminated_length": 319.625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.8393285371702638, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.03513953648507595, + "learning_rate": 1.439862543496795e-05, + "loss": 0.0014, + "num_tokens": 37863846.0, + "reward": 1.5714285373687744, + "reward_std": 0.3211449980735779, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5714285373687744, + "rewards/fixed_code_pass_all_test_reward/std": 0.3211449980735779, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 346.5, + "completions/mean_terminated_length": 346.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.8395130049806309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.03687436110340059, + "learning_rate": 1.4395733585860088e-05, + "loss": 0.0015, + "num_tokens": 37876826.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 317.5, + "completions/mean_terminated_length": 317.5, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.8396974727909979, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.037704933900386095, + "learning_rate": 1.4392841281031796e-05, + "loss": 0.0015, + "num_tokens": 37883662.0, + "reward": 1.52173912525177, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.52173912525177, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1190.0, + "completions/max_terminated_length": 1190.0, + "completions/mean_length": 641.75, + "completions/mean_terminated_length": 641.75, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.839881940601365, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.03241912368685007, + "learning_rate": 1.4389948520782934e-05, + "loss": 0.0013, + "num_tokens": 37896684.0, + "reward": 1.9848484992980957, + "reward_std": 0.02805512771010399, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9848484992980957, + "rewards/fixed_code_pass_all_test_reward/std": 0.028055155649781227, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 440.625, + "completions/mean_terminated_length": 440.625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.8400664084117322, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05810546875, + "kl": 0.03254800895228982, + "learning_rate": 1.4387055305413406e-05, + "loss": 0.0013, + "num_tokens": 37905569.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 512.75, + "completions/mean_terminated_length": 512.75, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8402508762220993, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.05436923401430249, + "learning_rate": 1.4384161635223157e-05, + "loss": 0.0022, + "num_tokens": 37919575.0, + "reward": 1.6726189851760864, + "reward_std": 0.12582732737064362, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6726190447807312, + "rewards/fixed_code_pass_all_test_reward/std": 0.125827357172966, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.8404353440324663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.02542793843895197, + "learning_rate": 1.4381267510512182e-05, + "loss": 0.001, + "num_tokens": 37928674.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.8406198118428334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.02108191279694438, + "learning_rate": 1.4378372931580531e-05, + "loss": 0.0008, + "num_tokens": 37939228.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 635.375, + "completions/mean_terminated_length": 635.375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "epoch": 0.8408042796532005, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.97265625, + "kl": 0.02184521418530494, + "learning_rate": 1.437547789872829e-05, + "loss": 0.0009, + "num_tokens": 37953463.0, + "reward": 1.8888888359069824, + "reward_std": 0.31426966190338135, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 155.875, + "completions/mean_terminated_length": 155.875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.8409887474635676, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32421875, + "kl": 0.08385562896728516, + "learning_rate": 1.4372582412255599e-05, + "loss": 0.0034, + "num_tokens": 37957518.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 385.25, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.8411732152739347, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3515625, + "kl": 0.05616301903501153, + "learning_rate": 1.4369686472462639e-05, + "loss": 0.0022, + "num_tokens": 37967176.0, + "reward": 1.724759578704834, + "reward_std": 0.29354214668273926, + "rewards/fixed_code_pass_all_test_reward/mean": 0.724759578704834, + "rewards/fixed_code_pass_all_test_reward/std": 0.29354214668273926, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 376.5, + "completions/mean_terminated_length": 376.5, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.8413576830843018, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.06292706169188023, + "learning_rate": 1.4366790079649646e-05, + "loss": 0.0025, + "num_tokens": 37974908.0, + "reward": 1.4632353782653809, + "reward_std": 0.3392840325832367, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4632353186607361, + "rewards/fixed_code_pass_all_test_reward/std": 0.33928412199020386, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.8415421508946689, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.15625, + "kl": 0.05460579879581928, + "learning_rate": 1.4363893234116897e-05, + "loss": 0.0022, + "num_tokens": 37980651.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.841726618705036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.036561060696840286, + "learning_rate": 1.4360995936164718e-05, + "loss": 0.0015, + "num_tokens": 37985879.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 229.125, + "completions/mean_terminated_length": 229.125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.841911086515403, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7109375, + "kl": 0.048594933934509754, + "learning_rate": 1.4358098186093481e-05, + "loss": 0.0019, + "num_tokens": 37994744.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 235.5, + "completions/mean_terminated_length": 235.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8420955543257701, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1337890625, + "kl": 0.037993370671756566, + "learning_rate": 1.4355199984203607e-05, + "loss": 0.0015, + "num_tokens": 38001868.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 226.5, + "completions/mean_terminated_length": 226.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.8422800221361373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.03935114876367152, + "learning_rate": 1.4352301330795562e-05, + "loss": 0.0016, + "num_tokens": 38007880.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 347.375, + "completions/mean_terminated_length": 347.375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.8424644899465044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.04007950960658491, + "learning_rate": 1.4349402226169856e-05, + "loss": 0.0016, + "num_tokens": 38015339.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 207.375, + "completions/mean_terminated_length": 207.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.8426489577568714, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.04747556382790208, + "learning_rate": 1.434650267062705e-05, + "loss": 0.0019, + "num_tokens": 38020326.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 396.375, + "completions/mean_terminated_length": 396.375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.8428334255672385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.051590414019301534, + "learning_rate": 1.4343602664467757e-05, + "loss": 0.0021, + "num_tokens": 38033769.0, + "reward": 1.0178570747375488, + "reward_std": 0.05050762742757797, + "rewards/fixed_code_pass_all_test_reward/mean": 0.01785714365541935, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 312.5, + "completions/mean_terminated_length": 312.5, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8430178933776056, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.05273324251174927, + "learning_rate": 1.434070220799262e-05, + "loss": 0.0021, + "num_tokens": 38042853.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 318.25, + "completions/mean_terminated_length": 318.25, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.8432023611879726, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05033519444987178, + "learning_rate": 1.4337801301502348e-05, + "loss": 0.002, + "num_tokens": 38050951.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 194.625, + "completions/mean_terminated_length": 194.625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.8433868289983398, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.03563853702507913, + "learning_rate": 1.433489994529768e-05, + "loss": 0.0014, + "num_tokens": 38055340.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 161.25, + "completions/mean_terminated_length": 161.25, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8435712968087069, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.671875, + "kl": 0.06850841618143022, + "learning_rate": 1.4331998139679416e-05, + "loss": 0.0027, + "num_tokens": 38059582.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.843755764619074, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044189453125, + "kl": 0.04532699077390134, + "learning_rate": 1.4329095884948394e-05, + "loss": 0.0018, + "num_tokens": 38069327.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 488.625, + "completions/mean_terminated_length": 488.625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.843940232429441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.828125, + "kl": 0.048503436613827944, + "learning_rate": 1.4326193181405497e-05, + "loss": 0.0019, + "num_tokens": 38080540.0, + "reward": 1.7083333730697632, + "reward_std": 0.2136233150959015, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.2136233150959015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 262.375, + "completions/mean_terminated_length": 262.375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8441247002398081, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0703125, + "kl": 0.051132958848029375, + "learning_rate": 1.4323290029351662e-05, + "loss": 0.002, + "num_tokens": 38089535.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 422.875, + "completions/mean_terminated_length": 422.875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.8443091680501752, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1484375, + "kl": 0.03595753642730415, + "learning_rate": 1.4320386429087868e-05, + "loss": 0.0014, + "num_tokens": 38098446.0, + "reward": 1.559999942779541, + "reward_std": 0.45544329285621643, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6850000023841858, + "rewards/fixed_code_pass_all_test_reward/std": 0.33101576566696167, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 378.5, + "completions/mean_terminated_length": 378.5, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.8444936358605424, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5, + "kl": 0.06097805546596646, + "learning_rate": 1.431748238091514e-05, + "loss": 0.0024, + "num_tokens": 38109810.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 370.5, + "completions/mean_terminated_length": 370.5, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8446781036709095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.04442149959504604, + "learning_rate": 1.4314577885134548e-05, + "loss": 0.0018, + "num_tokens": 38120070.0, + "reward": 1.920212745666504, + "reward_std": 0.13103443384170532, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9202127456665039, + "rewards/fixed_code_pass_all_test_reward/std": 0.13103443384170532, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 437.625, + "completions/mean_terminated_length": 437.625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.8448625714812765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.020964991999790072, + "learning_rate": 1.4311672942047214e-05, + "loss": 0.0008, + "num_tokens": 38128883.0, + "reward": 1.808333396911621, + "reward_std": 0.108012355864048, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8083333373069763, + "rewards/fixed_code_pass_all_test_reward/std": 0.108012355864048, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 267.875, + "completions/mean_terminated_length": 267.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.8450470392916436, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.042093463242053986, + "learning_rate": 1.4308767551954306e-05, + "loss": 0.0017, + "num_tokens": 38133994.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 328.0, + "completions/mean_terminated_length": 328.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8452315071020107, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1376953125, + "kl": 0.049778236891143024, + "learning_rate": 1.4305861715157027e-05, + "loss": 0.002, + "num_tokens": 38142314.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 446.75, + "completions/mean_terminated_length": 446.75, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.8454159749123777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.03358162404038012, + "learning_rate": 1.4302955431956642e-05, + "loss": 0.0013, + "num_tokens": 38153328.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 313.5, + "completions/mean_terminated_length": 313.5, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8456004427227449, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.028915792470797896, + "learning_rate": 1.4300048702654455e-05, + "loss": 0.0012, + "num_tokens": 38160548.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 148.375, + "completions/mean_terminated_length": 148.375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.845784910533112, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.59375, + "kl": 0.03875794424675405, + "learning_rate": 1.4297141527551813e-05, + "loss": 0.0016, + "num_tokens": 38164479.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 362.25, + "completions/mean_terminated_length": 362.25, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8459693783434791, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03222684934735298, + "learning_rate": 1.4294233906950113e-05, + "loss": 0.0013, + "num_tokens": 38172137.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 656.375, + "completions/mean_terminated_length": 656.375, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "epoch": 0.8461538461538461, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.72265625, + "kl": 0.01860711502376944, + "learning_rate": 1.4291325841150798e-05, + "loss": 0.0007, + "num_tokens": 38186540.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 436.875, + "completions/mean_terminated_length": 436.875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.8463383139642132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.0303362071281299, + "learning_rate": 1.4288417330455358e-05, + "loss": 0.0012, + "num_tokens": 38194907.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 177.75, + "completions/mean_terminated_length": 177.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8465227817745803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09375, + "kl": 0.04035550489788875, + "learning_rate": 1.4285508375165332e-05, + "loss": 0.0016, + "num_tokens": 38200969.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8467072495849475, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052001953125, + "kl": 0.023657016921788454, + "learning_rate": 1.428259897558229e-05, + "loss": 0.0009, + "num_tokens": 38205752.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8468917173953145, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.609375, + "kl": 0.05741121247410774, + "learning_rate": 1.427968913200787e-05, + "loss": 0.0023, + "num_tokens": 38213978.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 328.875, + "completions/mean_terminated_length": 328.875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.8470761852056816, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.05841058096848428, + "learning_rate": 1.4276778844743739e-05, + "loss": 0.0023, + "num_tokens": 38224353.0, + "reward": 1.7750000953674316, + "reward_std": 0.3284160792827606, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7749999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.3284161388874054, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 291.875, + "completions/mean_terminated_length": 291.875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.8472606530160487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.021851423487532884, + "learning_rate": 1.4273868114091621e-05, + "loss": 0.0009, + "num_tokens": 38230656.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8474451208264158, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6484375, + "kl": 0.048348452895879745, + "learning_rate": 1.4270956940353278e-05, + "loss": 0.0019, + "num_tokens": 38236677.0, + "reward": 1.0760869979858398, + "reward_std": 0.020126517862081528, + "rewards/fixed_code_pass_all_test_reward/mean": 0.07608695328235626, + "rewards/fixed_code_pass_all_test_reward/std": 0.020126523450016975, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 463.875, + "completions/mean_terminated_length": 463.875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.8476295886367828, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.027008824865333736, + "learning_rate": 1.4268045323830519e-05, + "loss": 0.0011, + "num_tokens": 38246164.0, + "reward": 1.6041667461395264, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7291666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 367.375, + "completions/mean_terminated_length": 367.375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.84781405644715, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.03559684753417969, + "learning_rate": 1.4265133264825209e-05, + "loss": 0.0014, + "num_tokens": 38256231.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8479985242575171, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.03747866733465344, + "learning_rate": 1.4262220763639244e-05, + "loss": 0.0015, + "num_tokens": 38262910.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 300.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.8481829920678842, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031982421875, + "kl": 0.01759509032126516, + "learning_rate": 1.425930782057457e-05, + "loss": 0.0007, + "num_tokens": 38269866.0, + "reward": 1.2222222089767456, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2222222238779068, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 187.25, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8483674598782512, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2138671875, + "kl": 0.08471361128613353, + "learning_rate": 1.425639443593319e-05, + "loss": 0.0034, + "num_tokens": 38274988.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 305.25, + "completions/mean_terminated_length": 305.25, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8485519276886183, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03515625, + "kl": 0.022619985742494464, + "learning_rate": 1.425348061001714e-05, + "loss": 0.0009, + "num_tokens": 38281078.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.8487363954989854, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.05580732366070151, + "learning_rate": 1.4250566343128504e-05, + "loss": 0.0022, + "num_tokens": 38287570.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 111.375, + "completions/mean_terminated_length": 111.375, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.8489208633093526, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.06061607296578586, + "learning_rate": 1.4247651635569419e-05, + "loss": 0.0024, + "num_tokens": 38291333.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.8491053311197196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.05291810049675405, + "learning_rate": 1.424473648764206e-05, + "loss": 0.0021, + "num_tokens": 38300091.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 386.375, + "completions/mean_terminated_length": 386.375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.8492897989300867, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.03605956328101456, + "learning_rate": 1.4241820899648651e-05, + "loss": 0.0014, + "num_tokens": 38311030.0, + "reward": 1.6607143878936768, + "reward_std": 0.2850758135318756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6607142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.285075843334198, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 117.0, + "completions/mean_terminated_length": 117.0, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.8494742667404538, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4296875, + "kl": 0.05825456231832504, + "learning_rate": 1.4238904871891456e-05, + "loss": 0.0023, + "num_tokens": 38314934.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 294.375, + "completions/mean_terminated_length": 294.375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.8496587345508209, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.04353521764278412, + "learning_rate": 1.4235988404672795e-05, + "loss": 0.0017, + "num_tokens": 38324697.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 303.875, + "completions/mean_terminated_length": 303.875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.8498432023611879, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.049427412915974855, + "learning_rate": 1.4233071498295026e-05, + "loss": 0.002, + "num_tokens": 38333280.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 507.375, + "completions/mean_terminated_length": 507.375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.8500276701715551, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9765625, + "kl": 0.025019120890647173, + "learning_rate": 1.4230154153060555e-05, + "loss": 0.001, + "num_tokens": 38346723.0, + "reward": 1.8304924964904785, + "reward_std": 0.12414851039648056, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8304924368858337, + "rewards/fixed_code_pass_all_test_reward/std": 0.12414851784706116, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 516.875, + "completions/mean_terminated_length": 516.875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.8502121379819222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.66015625, + "kl": 0.0345127556938678, + "learning_rate": 1.4227236369271832e-05, + "loss": 0.0014, + "num_tokens": 38360930.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 245.875, + "completions/mean_terminated_length": 245.875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.8503966057922893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032470703125, + "kl": 0.02621616458054632, + "learning_rate": 1.4224318147231353e-05, + "loss": 0.001, + "num_tokens": 38367009.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 293.375, + "completions/mean_terminated_length": 293.375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8505810736026563, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04931640625, + "kl": 0.01958166650729254, + "learning_rate": 1.422139948724166e-05, + "loss": 0.0008, + "num_tokens": 38372972.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 523.5, + "completions/mean_terminated_length": 523.5, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.8507655414130234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10546875, + "kl": 0.058367958292365074, + "learning_rate": 1.4218480389605345e-05, + "loss": 0.0023, + "num_tokens": 38388040.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 234.5, + "completions/mean_terminated_length": 234.5, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8509500092233905, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.053669103886932135, + "learning_rate": 1.421556085462503e-05, + "loss": 0.0021, + "num_tokens": 38392788.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 111.625, + "completions/mean_terminated_length": 111.625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8511344770337577, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.296875, + "kl": 0.0692718462087214, + "learning_rate": 1.4212640882603406e-05, + "loss": 0.0028, + "num_tokens": 38396545.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 253.75, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.8513189448441247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.04387856216635555, + "learning_rate": 1.4209720473843187e-05, + "loss": 0.0018, + "num_tokens": 38406231.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 287.875, + "completions/mean_terminated_length": 287.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.8515034126544918, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.07112867990508676, + "learning_rate": 1.4206799628647145e-05, + "loss": 0.0028, + "num_tokens": 38412398.0, + "reward": 1.2430555820465088, + "reward_std": 0.2684386670589447, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2430555671453476, + "rewards/fixed_code_pass_all_test_reward/std": 0.2684386670589447, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 165.75, + "completions/mean_terminated_length": 165.75, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8516878804648589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.197265625, + "kl": 0.0843011059332639, + "learning_rate": 1.420387834731809e-05, + "loss": 0.0034, + "num_tokens": 38417492.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 285.125, + "completions/mean_terminated_length": 285.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.851872348275226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.043110474944114685, + "learning_rate": 1.4200956630158889e-05, + "loss": 0.0017, + "num_tokens": 38427757.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 166.875, + "completions/mean_terminated_length": 166.875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.852056816085593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1083984375, + "kl": 0.03616967354901135, + "learning_rate": 1.419803447747244e-05, + "loss": 0.0014, + "num_tokens": 38432036.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 343.75, + "completions/mean_terminated_length": 343.75, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.8522412838959601, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.05243367259390652, + "learning_rate": 1.4195111889561695e-05, + "loss": 0.0021, + "num_tokens": 38439162.0, + "reward": 1.9107142686843872, + "reward_std": 0.12518209218978882, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9107142686843872, + "rewards/fixed_code_pass_all_test_reward/std": 0.12518209218978882, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 748.875, + "completions/mean_terminated_length": 748.875, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "epoch": 0.8524257517063273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6328125, + "kl": 0.027471379144117236, + "learning_rate": 1.4192188866729643e-05, + "loss": 0.0011, + "num_tokens": 38456049.0, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 416.875, + "completions/mean_terminated_length": 416.875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.8526102195166944, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.040016493294388056, + "learning_rate": 1.4189265409279331e-05, + "loss": 0.0016, + "num_tokens": 38465272.0, + "reward": 1.859375, + "reward_std": 0.08010874688625336, + "rewards/fixed_code_pass_all_test_reward/mean": 0.859375, + "rewards/fixed_code_pass_all_test_reward/std": 0.08010874688625336, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 331.25, + "completions/mean_terminated_length": 331.25, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8527946873270614, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1044921875, + "kl": 0.06577845476567745, + "learning_rate": 1.418634151751384e-05, + "loss": 0.0026, + "num_tokens": 38471394.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 320.875, + "completions/mean_terminated_length": 320.875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.8529791551374285, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.035824211314320564, + "learning_rate": 1.4183417191736301e-05, + "loss": 0.0014, + "num_tokens": 38478681.0, + "reward": 1.6875, + "reward_std": 0.40510135889053345, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6875, + "rewards/fixed_code_pass_all_test_reward/std": 0.4051014184951782, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 266.25, + "completions/mean_terminated_length": 266.25, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8531636229477956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.07212663046084344, + "learning_rate": 1.4180492432249885e-05, + "loss": 0.0029, + "num_tokens": 38487563.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 206.375, + "completions/mean_terminated_length": 206.375, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.8533480907581626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.181640625, + "kl": 0.0662439635489136, + "learning_rate": 1.4177567239357817e-05, + "loss": 0.0026, + "num_tokens": 38496142.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 247.0, + "completions/mean_terminated_length": 247.0, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8535325585685298, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.154296875, + "kl": 0.058274308452382684, + "learning_rate": 1.4174641613363358e-05, + "loss": 0.0023, + "num_tokens": 38504438.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 291.875, + "completions/mean_terminated_length": 291.875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.8537170263788969, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.02619321981910616, + "learning_rate": 1.4171715554569816e-05, + "loss": 0.001, + "num_tokens": 38510445.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 389.375, + "completions/mean_terminated_length": 389.375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.853901494189264, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.46875, + "kl": 0.047740797279402614, + "learning_rate": 1.4168789063280553e-05, + "loss": 0.0019, + "num_tokens": 38518008.0, + "reward": 1.5612244606018066, + "reward_std": 0.08307758718729019, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5612244606018066, + "rewards/fixed_code_pass_all_test_reward/std": 0.08307760953903198, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 384.625, + "completions/mean_terminated_length": 384.625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.854085961999631, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05615234375, + "kl": 0.03802112769335508, + "learning_rate": 1.4165862139798958e-05, + "loss": 0.0015, + "num_tokens": 38527293.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 418.875, + "completions/mean_terminated_length": 418.875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.8542704298099981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0289306640625, + "kl": 0.034402198158204556, + "learning_rate": 1.4162934784428484e-05, + "loss": 0.0014, + "num_tokens": 38535332.0, + "reward": 1.7058823108673096, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7058823704719543, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 323.5, + "completions/mean_terminated_length": 323.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8544548976203652, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1953125, + "kl": 0.04122254904359579, + "learning_rate": 1.416000699747261e-05, + "loss": 0.0016, + "num_tokens": 38545824.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 340.75, + "completions/mean_terminated_length": 340.75, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.8546393654307324, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.052680740132927895, + "learning_rate": 1.415707877923488e-05, + "loss": 0.0021, + "num_tokens": 38555702.0, + "reward": 1.105769157409668, + "reward_std": 0.29916054010391235, + "rewards/fixed_code_pass_all_test_reward/mean": 0.10576923191547394, + "rewards/fixed_code_pass_all_test_reward/std": 0.29916059970855713, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 818.5, + "completions/mean_terminated_length": 818.5, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "epoch": 0.8548238332410995, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04296875, + "kl": 0.0287223911145702, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.0011, + "num_tokens": 38569594.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 403.875, + "completions/mean_terminated_length": 403.875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.8550083010514665, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.06830438645556569, + "learning_rate": 1.4151221050128193e-05, + "loss": 0.0027, + "num_tokens": 38580545.0, + "reward": 1.1354167461395264, + "reward_std": 0.1886538565158844, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1354166716337204, + "rewards/fixed_code_pass_all_test_reward/std": 0.1886538714170456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 327.375, + "completions/mean_terminated_length": 327.375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.8551927688618336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1025390625, + "kl": 0.05095339729450643, + "learning_rate": 1.4148291539866524e-05, + "loss": 0.002, + "num_tokens": 38590628.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 454.5, + "completions/mean_terminated_length": 454.5, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.8553772366722007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8984375, + "kl": 0.032174097606912255, + "learning_rate": 1.4145361599537577e-05, + "loss": 0.0013, + "num_tokens": 38599688.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 516.625, + "completions/mean_terminated_length": 516.625, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.8555617044825677, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.279296875, + "kl": 0.059816672233864665, + "learning_rate": 1.4142431229445106e-05, + "loss": 0.0024, + "num_tokens": 38611869.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8557461722929349, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.029067994095385075, + "learning_rate": 1.4139500429892916e-05, + "loss": 0.0012, + "num_tokens": 38622955.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 258.625, + "completions/mean_terminated_length": 258.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.855930640103302, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.052311800653114915, + "learning_rate": 1.4136569201184844e-05, + "loss": 0.0021, + "num_tokens": 38629056.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 232.0, + "completions/mean_terminated_length": 232.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.8561151079136691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.034833905403502285, + "learning_rate": 1.413363754362479e-05, + "loss": 0.0014, + "num_tokens": 38637208.0, + "reward": 1.8928570747375488, + "reward_std": 0.30304577946662903, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8928571343421936, + "rewards/fixed_code_pass_all_test_reward/std": 0.30304577946662903, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 401.5, + "completions/mean_terminated_length": 401.5, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.8562995757240361, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.03321625152602792, + "learning_rate": 1.4130705457516683e-05, + "loss": 0.0013, + "num_tokens": 38645412.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 414.625, + "completions/mean_terminated_length": 414.625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.8564840435344032, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73046875, + "kl": 0.03590543591417372, + "learning_rate": 1.4127772943164506e-05, + "loss": 0.0014, + "num_tokens": 38657305.0, + "reward": 1.78125, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.78125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 292.375, + "completions/mean_terminated_length": 292.375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.8566685113447703, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1875, + "kl": 0.029136902536265552, + "learning_rate": 1.4124840000872275e-05, + "loss": 0.0012, + "num_tokens": 38667604.0, + "reward": 1.6319444179534912, + "reward_std": 0.404654860496521, + "rewards/fixed_code_pass_all_test_reward/mean": 0.631944477558136, + "rewards/fixed_code_pass_all_test_reward/std": 0.40465492010116577, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 459.5, + "completions/mean_terminated_length": 459.5, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.8568529791551375, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.03150592965539545, + "learning_rate": 1.4121906630944069e-05, + "loss": 0.0013, + "num_tokens": 38676152.0, + "reward": 1.5113636255264282, + "reward_std": 0.1607060581445694, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5113636255264282, + "rewards/fixed_code_pass_all_test_reward/std": 0.16070608794689178, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 520.0, + "completions/mean_terminated_length": 520.0, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.8570374469655045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.67578125, + "kl": 0.01683203608263284, + "learning_rate": 1.4118972833683993e-05, + "loss": 0.0007, + "num_tokens": 38685984.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 301.875, + "completions/mean_terminated_length": 301.875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8572219147758716, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.05156271671876311, + "learning_rate": 1.4116038609396203e-05, + "loss": 0.0021, + "num_tokens": 38694559.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 406.5, + "completions/mean_terminated_length": 406.5, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.8574063825862387, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.02123437717091292, + "learning_rate": 1.4113103958384903e-05, + "loss": 0.0008, + "num_tokens": 38701595.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 302.375, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8575908503966058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23046875, + "kl": 0.05607993807643652, + "learning_rate": 1.411016888095434e-05, + "loss": 0.0022, + "num_tokens": 38708430.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 373.375, + "completions/mean_terminated_length": 373.375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.8577753182069728, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045654296875, + "kl": 0.034819274209439754, + "learning_rate": 1.4107233377408797e-05, + "loss": 0.0014, + "num_tokens": 38719305.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 275.125, + "completions/mean_terminated_length": 275.125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.85795978601734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.07306134095415473, + "learning_rate": 1.4104297448052612e-05, + "loss": 0.0029, + "num_tokens": 38728410.0, + "reward": 1.4821428060531616, + "reward_std": 0.2850758135318756, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.2850758135318756, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 126.625, + "completions/mean_terminated_length": 126.625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.8581442538277071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.015275871934136376, + "learning_rate": 1.4101361093190162e-05, + "loss": 0.0006, + "num_tokens": 38732399.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 291.625, + "completions/mean_terminated_length": 291.625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.8583287216380742, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.02554186386987567, + "learning_rate": 1.409842431312587e-05, + "loss": 0.001, + "num_tokens": 38737668.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 261.5, + "completions/mean_terminated_length": 261.5, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.8585131894484412, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04166206298395991, + "learning_rate": 1.40954871081642e-05, + "loss": 0.0017, + "num_tokens": 38745320.0, + "reward": 1.9596774578094482, + "reward_std": 0.0746629536151886, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9596773982048035, + "rewards/fixed_code_pass_all_test_reward/std": 0.07466292381286621, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 366.625, + "completions/mean_terminated_length": 366.625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.8586976572588083, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.0377315788064152, + "learning_rate": 1.409254947860966e-05, + "loss": 0.0015, + "num_tokens": 38752653.0, + "reward": 1.9956896305084229, + "reward_std": 0.01219149399548769, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9956896305084229, + "rewards/fixed_code_pass_all_test_reward/std": 0.012191502377390862, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 183.0, + "completions/mean_terminated_length": 183.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.8588821250691754, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.07363160327076912, + "learning_rate": 1.4089611424766808e-05, + "loss": 0.0029, + "num_tokens": 38756933.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 625.375, + "completions/mean_terminated_length": 625.375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "epoch": 0.8590665928795426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.921875, + "kl": 0.04570416756905615, + "learning_rate": 1.4086672946940238e-05, + "loss": 0.0018, + "num_tokens": 38770040.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 248.625, + "completions/mean_terminated_length": 248.625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8592510606899096, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5859375, + "kl": 0.022240989143028855, + "learning_rate": 1.4083734045434597e-05, + "loss": 0.0009, + "num_tokens": 38775549.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 428.5, + "completions/mean_terminated_length": 428.5, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.8594355285002767, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.859375, + "kl": 0.020981371984817088, + "learning_rate": 1.408079472055456e-05, + "loss": 0.0008, + "num_tokens": 38783249.0, + "reward": 1.953125, + "reward_std": 0.09300297498703003, + "rewards/fixed_code_pass_all_test_reward/mean": 0.953125, + "rewards/fixed_code_pass_all_test_reward/std": 0.09300298243761063, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 298.25, + "completions/mean_terminated_length": 298.25, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.8596199963106438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.03129088052082807, + "learning_rate": 1.4077854972604872e-05, + "loss": 0.0013, + "num_tokens": 38789619.0, + "reward": 1.6510417461395264, + "reward_std": 0.21355074644088745, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6510416865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.21355074644088745, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 398.375, + "completions/mean_terminated_length": 398.375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.8598044641210109, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09521484375, + "kl": 0.06676631979644299, + "learning_rate": 1.4074914801890294e-05, + "loss": 0.0027, + "num_tokens": 38799774.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 437.0, + "completions/mean_terminated_length": 437.0, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.8599889319313779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08984375, + "kl": 0.05240622255951166, + "learning_rate": 1.4071974208715653e-05, + "loss": 0.0021, + "num_tokens": 38810334.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 507.75, + "completions/mean_terminated_length": 507.75, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.8601733997417451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06982421875, + "kl": 0.03791411221027374, + "learning_rate": 1.40690331933858e-05, + "loss": 0.0015, + "num_tokens": 38822228.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.8603578675521122, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.0486620282754302, + "learning_rate": 1.4066091756205646e-05, + "loss": 0.0019, + "num_tokens": 38827676.0, + "reward": 1.96875, + "reward_std": 0.0883883461356163, + "rewards/fixed_code_pass_all_test_reward/mean": 0.96875, + "rewards/fixed_code_pass_all_test_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 346.625, + "completions/mean_terminated_length": 346.625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.8605423353624793, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.04177918133791536, + "learning_rate": 1.4063149897480139e-05, + "loss": 0.0017, + "num_tokens": 38837497.0, + "reward": 1.875, + "reward_std": 0.2314550280570984, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.2314550280570984, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 299.625, + "completions/mean_terminated_length": 299.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8607268031728463, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20703125, + "kl": 0.04682913247961551, + "learning_rate": 1.4060207617514275e-05, + "loss": 0.0019, + "num_tokens": 38844182.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 305.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8609112709832134, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.453125, + "kl": 0.11341713066212833, + "learning_rate": 1.4057264916613078e-05, + "loss": 0.0045, + "num_tokens": 38854027.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 334.25, + "completions/mean_terminated_length": 334.25, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8610957387935805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0634765625, + "kl": 0.03464405215345323, + "learning_rate": 1.4054321795081643e-05, + "loss": 0.0014, + "num_tokens": 38865557.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 405.625, + "completions/mean_terminated_length": 405.625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.8612802066039477, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.048946703085675836, + "learning_rate": 1.405137825322508e-05, + "loss": 0.002, + "num_tokens": 38875098.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 371.375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8614646744143147, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2109375, + "kl": 0.04572533001191914, + "learning_rate": 1.4048434291348567e-05, + "loss": 0.0018, + "num_tokens": 38882053.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 396.0, + "completions/mean_terminated_length": 396.0, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.8616491422246818, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.0394243742339313, + "learning_rate": 1.4045489909757307e-05, + "loss": 0.0016, + "num_tokens": 38891677.0, + "reward": 1.65625, + "reward_std": 0.27294278144836426, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.27294281125068665, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 645.125, + "completions/mean_terminated_length": 645.125, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.8618336100350489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8515625, + "kl": 0.022074261447414756, + "learning_rate": 1.4042545108756558e-05, + "loss": 0.0009, + "num_tokens": 38908086.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 419.875, + "completions/mean_terminated_length": 419.875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.862018077845416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.0434700867626816, + "learning_rate": 1.4039599888651614e-05, + "loss": 0.0017, + "num_tokens": 38918125.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 369.875, + "completions/mean_terminated_length": 369.875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.862202545655783, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.34375, + "kl": 0.03971857321448624, + "learning_rate": 1.4036654249747817e-05, + "loss": 0.0016, + "num_tokens": 38925548.0, + "reward": 1.9166667461395264, + "reward_std": 0.23570223152637482, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9166666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.2357022762298584, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 179.5, + "completions/mean_terminated_length": 179.5, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.8623870134661502, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050048828125, + "kl": 0.027675921679474413, + "learning_rate": 1.403370819235055e-05, + "loss": 0.0011, + "num_tokens": 38929752.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 261.625, + "completions/mean_terminated_length": 261.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.8625714812765173, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.53125, + "kl": 0.22531674487981945, + "learning_rate": 1.4030761716765246e-05, + "loss": 0.009, + "num_tokens": 38935525.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 307.5, + "completions/mean_terminated_length": 307.5, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8627559490868844, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.02901852980721742, + "learning_rate": 1.402781482329737e-05, + "loss": 0.0012, + "num_tokens": 38944401.0, + "reward": 1.2893518209457397, + "reward_std": 0.5244312882423401, + "rewards/fixed_code_pass_all_test_reward/mean": 0.41435185074806213, + "rewards/fixed_code_pass_all_test_reward/std": 0.17788177728652954, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.8629404168972514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09716796875, + "kl": 0.026566158689092845, + "learning_rate": 1.4024867512252438e-05, + "loss": 0.0011, + "num_tokens": 38951060.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 366.375, + "completions/mean_terminated_length": 366.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8631248847076185, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.04402469424530864, + "learning_rate": 1.4021919783936008e-05, + "loss": 0.0018, + "num_tokens": 38958223.0, + "reward": 1.1770833730697632, + "reward_std": 0.337910920381546, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1770833432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.33791089057922363, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 146.625, + "completions/mean_terminated_length": 146.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8633093525179856, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.09375, + "kl": 0.08293632697314024, + "learning_rate": 1.4018971638653684e-05, + "loss": 0.0033, + "num_tokens": 38962124.0, + "reward": 0.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 4680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 388.875, + "completions/mean_terminated_length": 388.875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.8634938203283528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.0634770910255611, + "learning_rate": 1.4016023076711106e-05, + "loss": 0.0025, + "num_tokens": 38971419.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 348.625, + "completions/mean_terminated_length": 348.625, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8636782881387198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041015625, + "kl": 0.0472181998193264, + "learning_rate": 1.4013074098413962e-05, + "loss": 0.0019, + "num_tokens": 38981288.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 448.25, + "completions/mean_terminated_length": 448.25, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.8638627559490869, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96484375, + "kl": 0.029731591464951634, + "learning_rate": 1.4010124704067983e-05, + "loss": 0.0012, + "num_tokens": 38989458.0, + "reward": 1.5340909957885742, + "reward_std": 0.3601700961589813, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5340908765792847, + "rewards/fixed_code_pass_all_test_reward/std": 0.3601701557636261, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 638.75, + "completions/mean_terminated_length": 638.75, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.864047223759454, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.0362716494128108, + "learning_rate": 1.4007174893978941e-05, + "loss": 0.0015, + "num_tokens": 39002464.0, + "reward": 1.9696969985961914, + "reward_std": 0.08570989966392517, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9696969985961914, + "rewards/fixed_code_pass_all_test_reward/std": 0.08570991456508636, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 347.25, + "completions/mean_terminated_length": 347.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.864231691569821, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3359375, + "kl": 0.06427385751157999, + "learning_rate": 1.4004224668452657e-05, + "loss": 0.0026, + "num_tokens": 39011818.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 458.0, + "completions/mean_terminated_length": 458.0, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.8644161593801881, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69921875, + "kl": 0.019109230604954064, + "learning_rate": 1.4001274027794983e-05, + "loss": 0.0008, + "num_tokens": 39022530.0, + "reward": 1.4409723281860352, + "reward_std": 0.22588132321834564, + "rewards/fixed_code_pass_all_test_reward/mean": 0.440972238779068, + "rewards/fixed_code_pass_all_test_reward/std": 0.22588135302066803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 160.875, + "completions/mean_terminated_length": 160.875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.8646006271905552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1298828125, + "kl": 0.0596800297498703, + "learning_rate": 1.399832297231183e-05, + "loss": 0.0024, + "num_tokens": 39026569.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 472.5, + "completions/mean_terminated_length": 472.5, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.8647850950009224, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.029860817012377083, + "learning_rate": 1.399537150230914e-05, + "loss": 0.0012, + "num_tokens": 39035477.0, + "reward": 1.8125, + "reward_std": 0.3482097089290619, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3482097089290619, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 408.75, + "completions/mean_terminated_length": 408.75, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.8649695628112894, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.03785516903735697, + "learning_rate": 1.3992419618092903e-05, + "loss": 0.0015, + "num_tokens": 39043427.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 166.625, + "completions/mean_terminated_length": 166.625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.8651540306216565, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.08576841419562697, + "learning_rate": 1.398946731996915e-05, + "loss": 0.0034, + "num_tokens": 39047592.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 466.5, + "completions/mean_terminated_length": 466.5, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.8653384984320236, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1953125, + "kl": 0.03537092311307788, + "learning_rate": 1.3986514608243957e-05, + "loss": 0.0014, + "num_tokens": 39059052.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 111.5, + "completions/mean_terminated_length": 111.5, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.8655229662423907, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1181640625, + "kl": 0.04479034268297255, + "learning_rate": 1.3983561483223438e-05, + "loss": 0.0018, + "num_tokens": 39062720.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 178.375, + "completions/mean_terminated_length": 178.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.8657074340527577, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0458984375, + "kl": 0.02328692434821278, + "learning_rate": 1.3980607945213756e-05, + "loss": 0.0009, + "num_tokens": 39066979.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 565.625, + "completions/mean_terminated_length": 565.625, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.8658919018631249, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.0344025946687907, + "learning_rate": 1.3977653994521112e-05, + "loss": 0.0014, + "num_tokens": 39077368.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.866076369673492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.02852268482092768, + "learning_rate": 1.3974699631451759e-05, + "loss": 0.0011, + "num_tokens": 39083498.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.8662608374838591, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07568359375, + "kl": 0.023713955422863364, + "learning_rate": 1.3971744856311975e-05, + "loss": 0.0009, + "num_tokens": 39087903.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 335.0, + "completions/mean_terminated_length": 335.0, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8664453052942261, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12890625, + "kl": 0.028498757048510015, + "learning_rate": 1.3968789669408098e-05, + "loss": 0.0011, + "num_tokens": 39093815.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 226.125, + "completions/mean_terminated_length": 226.125, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.8666297731045932, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.04299801285378635, + "learning_rate": 1.3965834071046502e-05, + "loss": 0.0017, + "num_tokens": 39102344.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8668142409149603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0498046875, + "kl": 0.031449089176021516, + "learning_rate": 1.3962878061533602e-05, + "loss": 0.0013, + "num_tokens": 39111599.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 632.5, + "completions/mean_terminated_length": 632.5, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "epoch": 0.8669987087253275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11865234375, + "kl": 0.0421741446480155, + "learning_rate": 1.395992164117586e-05, + "loss": 0.0017, + "num_tokens": 39123179.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 266.875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8671831765356945, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.02783536654897034, + "learning_rate": 1.3956964810279775e-05, + "loss": 0.0011, + "num_tokens": 39131290.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.8673676443460616, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062255859375, + "kl": 0.02601873315870762, + "learning_rate": 1.3954007569151893e-05, + "loss": 0.001, + "num_tokens": 39136316.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 200.25, + "completions/mean_terminated_length": 200.25, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.8675521121564287, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.140625, + "kl": 0.03543301299214363, + "learning_rate": 1.39510499180988e-05, + "loss": 0.0014, + "num_tokens": 39145190.0, + "reward": 1.8774752616882324, + "reward_std": 0.346552312374115, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8774752616882324, + "rewards/fixed_code_pass_all_test_reward/std": 0.3465523421764374, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 231.0, + "completions/mean_terminated_length": 231.0, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.8677365799667958, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06591796875, + "kl": 0.031419944134540856, + "learning_rate": 1.3948091857427126e-05, + "loss": 0.0013, + "num_tokens": 39150086.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 268.625, + "completions/mean_terminated_length": 268.625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.8679210477771628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.04398674704134464, + "learning_rate": 1.3945133387443544e-05, + "loss": 0.0018, + "num_tokens": 39158563.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 591.625, + "completions/mean_terminated_length": 591.625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.86810551558753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.671875, + "kl": 0.015890662907622755, + "learning_rate": 1.3942174508454768e-05, + "loss": 0.0006, + "num_tokens": 39168232.0, + "reward": 1.9642857313156128, + "reward_std": 0.10101523250341415, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 163.25, + "completions/mean_terminated_length": 163.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8682899833978971, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.044706322136335075, + "learning_rate": 1.3939215220767557e-05, + "loss": 0.0018, + "num_tokens": 39172218.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 245.125, + "completions/mean_terminated_length": 245.125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8684744512082642, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.146484375, + "kl": 0.04174162331037223, + "learning_rate": 1.3936255524688707e-05, + "loss": 0.0017, + "num_tokens": 39181339.0, + "reward": 1.2999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.30000001192092896, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 153.5, + "completions/mean_terminated_length": 153.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.8686589190186312, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.109375, + "kl": 0.06961822230368853, + "learning_rate": 1.3933295420525059e-05, + "loss": 0.0028, + "num_tokens": 39185407.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 465.5, + "completions/mean_terminated_length": 465.5, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.8688433868289983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8359375, + "kl": 0.0350196931976825, + "learning_rate": 1.3930334908583498e-05, + "loss": 0.0014, + "num_tokens": 39198059.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 448.5, + "completions/mean_terminated_length": 448.5, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.8690278546393654, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2421875, + "kl": 0.0317831733264029, + "learning_rate": 1.3927373989170955e-05, + "loss": 0.0013, + "num_tokens": 39210791.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 329.125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.8692123224497326, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.03357195947319269, + "learning_rate": 1.392441266259439e-05, + "loss": 0.0013, + "num_tokens": 39223720.0, + "reward": 1.5663264989852905, + "reward_std": 0.10101527720689774, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5663264989852905, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101527720689774, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.8693967902600996, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.03081578010460362, + "learning_rate": 1.392145092916082e-05, + "loss": 0.0012, + "num_tokens": 39228857.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 159.375, + "completions/mean_terminated_length": 159.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8695812580704667, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.375, + "kl": 0.05426775990054011, + "learning_rate": 1.3918488789177298e-05, + "loss": 0.0022, + "num_tokens": 39232980.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 197.75, + "completions/mean_terminated_length": 197.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.8697657258808338, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0654296875, + "kl": 0.058262263890355825, + "learning_rate": 1.3915526242950915e-05, + "loss": 0.0023, + "num_tokens": 39240370.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 263.375, + "completions/mean_terminated_length": 263.375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8699501936912009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.058837890625, + "kl": 0.030521248700097203, + "learning_rate": 1.3912563290788808e-05, + "loss": 0.0012, + "num_tokens": 39246613.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 225.375, + "completions/mean_terminated_length": 225.375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.8701346615015679, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.65234375, + "kl": 0.05464904918335378, + "learning_rate": 1.3909599932998159e-05, + "loss": 0.0022, + "num_tokens": 39251112.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.8703191293119351, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.06137406104244292, + "learning_rate": 1.390663616988619e-05, + "loss": 0.0025, + "num_tokens": 39255130.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 278.0, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8705035971223022, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057861328125, + "kl": 0.030905081424862146, + "learning_rate": 1.390367200176016e-05, + "loss": 0.0012, + "num_tokens": 39263522.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 508.125, + "completions/mean_terminated_length": 508.125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.8706880649326693, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.02643391815945506, + "learning_rate": 1.3900707428927376e-05, + "loss": 0.0011, + "num_tokens": 39272891.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 312.0, + "completions/mean_terminated_length": 312.0, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8708725327430363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.037322524236515164, + "learning_rate": 1.3897742451695187e-05, + "loss": 0.0015, + "num_tokens": 39280091.0, + "reward": 1.9017857313156128, + "reward_std": 0.18185748159885406, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9017857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.18185752630233765, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 488.0, + "completions/mean_terminated_length": 265.14288330078125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.8710570005534034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.053849385818466544, + "learning_rate": 1.3894777070370984e-05, + "loss": 0.0022, + "num_tokens": 39287483.0, + "reward": 1.625, + "reward_std": 0.7440237998962402, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 141.0, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.8712414683637705, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.06663461215794086, + "learning_rate": 1.389181128526219e-05, + "loss": 0.0027, + "num_tokens": 39291323.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 685.25, + "completions/mean_terminated_length": 685.25, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "epoch": 0.8714259361741377, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2890625, + "kl": 0.0304770766524598, + "learning_rate": 1.3888845096676286e-05, + "loss": 0.0012, + "num_tokens": 39303389.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 184.75, + "completions/mean_terminated_length": 184.75, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.8716104039845047, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10986328125, + "kl": 0.0554989711381495, + "learning_rate": 1.3885878504920785e-05, + "loss": 0.0022, + "num_tokens": 39309547.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 218.25, + "completions/mean_terminated_length": 218.25, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.8717948717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.047845219960436225, + "learning_rate": 1.3882911510303241e-05, + "loss": 0.0019, + "num_tokens": 39314309.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 231.875, + "completions/mean_terminated_length": 231.875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.8719793396052389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1123046875, + "kl": 0.05718048848211765, + "learning_rate": 1.3879944113131251e-05, + "loss": 0.0023, + "num_tokens": 39320164.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 322.0, + "completions/mean_terminated_length": 322.0, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.872163807415606, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.052743843058124185, + "learning_rate": 1.3876976313712457e-05, + "loss": 0.0021, + "num_tokens": 39327476.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.872348275225973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.021337245707400143, + "learning_rate": 1.3874008112354545e-05, + "loss": 0.0009, + "num_tokens": 39334034.0, + "reward": 1.899999976158142, + "reward_std": 0.18516401946544647, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.18516401946544647, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 222.5, + "completions/mean_terminated_length": 222.5, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8725327430363402, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1650390625, + "kl": 0.055611724965274334, + "learning_rate": 1.3871039509365235e-05, + "loss": 0.0022, + "num_tokens": 39341750.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 319.125, + "completions/mean_terminated_length": 319.125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.8727172108467073, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.79296875, + "kl": 0.06065567955374718, + "learning_rate": 1.3868070505052287e-05, + "loss": 0.0024, + "num_tokens": 39351071.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 403.375, + "completions/mean_terminated_length": 403.375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.8729016786570744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.03125, + "kl": 0.027958056307397783, + "learning_rate": 1.3865101099723515e-05, + "loss": 0.0011, + "num_tokens": 39361082.0, + "reward": 1.9765625, + "reward_std": 0.06629125773906708, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9765625, + "rewards/fixed_code_pass_all_test_reward/std": 0.06629125773906708, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.8730861464674414, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3671875, + "kl": 0.03316507791168988, + "learning_rate": 1.3862131293686762e-05, + "loss": 0.0013, + "num_tokens": 39369120.0, + "reward": 1.6488094329833984, + "reward_std": 0.45882448554039, + "rewards/fixed_code_pass_all_test_reward/mean": 0.648809552192688, + "rewards/fixed_code_pass_all_test_reward/std": 0.4588245153427124, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 382.875, + "completions/mean_terminated_length": 382.875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.8732706142778085, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.036426066188141704, + "learning_rate": 1.3859161087249924e-05, + "loss": 0.0015, + "num_tokens": 39379863.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 177.0, + "completions/mean_terminated_length": 177.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.8734550820881756, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.103515625, + "kl": 0.04740915121510625, + "learning_rate": 1.3856190480720926e-05, + "loss": 0.0019, + "num_tokens": 39386615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 269.25, + "completions/mean_terminated_length": 269.25, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.8736395498985428, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4765625, + "kl": 0.035562718054279685, + "learning_rate": 1.3853219474407741e-05, + "loss": 0.0014, + "num_tokens": 39392177.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 403.625, + "completions/mean_terminated_length": 403.625, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.8738240177089098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7421875, + "kl": 0.02128627779893577, + "learning_rate": 1.3850248068618388e-05, + "loss": 0.0009, + "num_tokens": 39400214.0, + "reward": 1.3409090042114258, + "reward_std": 0.09409989416599274, + "rewards/fixed_code_pass_all_test_reward/mean": 0.34090909361839294, + "rewards/fixed_code_pass_all_test_reward/std": 0.09409984946250916, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 577.25, + "completions/mean_terminated_length": 577.25, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.8740084855192769, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0224609375, + "kl": 0.012722728541120887, + "learning_rate": 1.3847276263660922e-05, + "loss": 0.0005, + "num_tokens": 39410264.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 386.875, + "completions/mean_terminated_length": 386.875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.874192953329644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.875, + "kl": 0.0531512814341113, + "learning_rate": 1.3844304059843435e-05, + "loss": 0.0021, + "num_tokens": 39419263.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 428.375, + "completions/mean_terminated_length": 428.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.874377421140011, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04443359375, + "kl": 0.024918338982388377, + "learning_rate": 1.3841331457474067e-05, + "loss": 0.001, + "num_tokens": 39429994.0, + "reward": 1.7469879388809204, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7469879388809204, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 328.5, + "completions/mean_terminated_length": 328.5, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.8745618889503781, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039794921875, + "kl": 0.03135993366595358, + "learning_rate": 1.3838358456861e-05, + "loss": 0.0013, + "num_tokens": 39437430.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 202.0, + "completions/mean_terminated_length": 202.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.8747463567607453, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2158203125, + "kl": 0.05347280763089657, + "learning_rate": 1.3835385058312456e-05, + "loss": 0.0021, + "num_tokens": 39442878.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 271.375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8749308245711124, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.03216646471992135, + "learning_rate": 1.3832411262136692e-05, + "loss": 0.0013, + "num_tokens": 39452073.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 215.75, + "completions/mean_terminated_length": 215.75, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.8751152923814794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05419921875, + "kl": 0.02850513276644051, + "learning_rate": 1.3829437068642013e-05, + "loss": 0.0011, + "num_tokens": 39456783.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 391.0, + "completions/mean_terminated_length": 391.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8752997601918465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.93359375, + "kl": 0.06560150324366987, + "learning_rate": 1.3826462478136768e-05, + "loss": 0.0026, + "num_tokens": 39467023.0, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 484.625, + "completions/mean_terminated_length": 484.625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.8754842280022136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8671875, + "kl": 0.015738442889414728, + "learning_rate": 1.382348749092934e-05, + "loss": 0.0006, + "num_tokens": 39480700.0, + "reward": 1.9276859760284424, + "reward_std": 0.01753157190978527, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9276859760284424, + "rewards/fixed_code_pass_all_test_reward/std": 0.01753157190978527, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 264.25, + "completions/mean_terminated_length": 264.25, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.8756686958125807, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.04961149860173464, + "learning_rate": 1.3820512107328152e-05, + "loss": 0.002, + "num_tokens": 39489542.0, + "reward": 1.1508620977401733, + "reward_std": 0.3506588041782379, + "rewards/fixed_code_pass_all_test_reward/mean": 0.15086206793785095, + "rewards/fixed_code_pass_all_test_reward/std": 0.3506588339805603, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 463.875, + "completions/mean_terminated_length": 463.875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.8758531636229479, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90234375, + "kl": 0.028414310072548687, + "learning_rate": 1.3817536327641678e-05, + "loss": 0.0011, + "num_tokens": 39500637.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 332.0, + "completions/mean_terminated_length": 332.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8760376314333149, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9453125, + "kl": 0.013821726315654814, + "learning_rate": 1.3814560152178426e-05, + "loss": 0.0006, + "num_tokens": 39507445.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1711.0, + "completions/max_terminated_length": 1711.0, + "completions/mean_length": 789.5, + "completions/mean_terminated_length": 789.5, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.876222099243682, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1171875, + "kl": 0.03998230630531907, + "learning_rate": 1.3811583581246941e-05, + "loss": 0.0016, + "num_tokens": 39520825.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 377.0, + "completions/mean_terminated_length": 377.0, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.8764065670540491, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.0433138650842011, + "learning_rate": 1.3808606615155821e-05, + "loss": 0.0017, + "num_tokens": 39533897.0, + "reward": 1.648809552192688, + "reward_std": 0.11921755224466324, + "rewards/fixed_code_pass_all_test_reward/mean": 0.648809552192688, + "rewards/fixed_code_pass_all_test_reward/std": 0.11921756714582443, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 326.375, + "completions/mean_terminated_length": 326.375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8765910348644161, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.90625, + "kl": 0.041911221109330654, + "learning_rate": 1.3805629254213693e-05, + "loss": 0.0017, + "num_tokens": 39541180.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 261.375, + "completions/mean_terminated_length": 261.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8767755026747832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.03478107205592096, + "learning_rate": 1.3802651498729234e-05, + "loss": 0.0014, + "num_tokens": 39550479.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 422.5, + "completions/mean_terminated_length": 422.5, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.8769599704851503, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0791015625, + "kl": 0.04243451589718461, + "learning_rate": 1.3799673349011153e-05, + "loss": 0.0017, + "num_tokens": 39560971.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 417.75, + "completions/mean_terminated_length": 417.75, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.8771444382955175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.96875, + "kl": 0.027786601800471544, + "learning_rate": 1.379669480536821e-05, + "loss": 0.0011, + "num_tokens": 39573289.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 209.375, + "completions/mean_terminated_length": 209.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.8773289061058845, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.8984375, + "kl": 0.04395279474556446, + "learning_rate": 1.3793715868109195e-05, + "loss": 0.0018, + "num_tokens": 39578860.0, + "reward": 1.8985848426818848, + "reward_std": 0.18778426945209503, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8985849022865295, + "rewards/fixed_code_pass_all_test_reward/std": 0.18778428435325623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 414.875, + "completions/mean_terminated_length": 414.875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.8775133739162516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.04496568674221635, + "learning_rate": 1.3790736537542948e-05, + "loss": 0.0018, + "num_tokens": 39588171.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 456.375, + "completions/mean_terminated_length": 456.375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8776978417266187, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.71875, + "kl": 0.02382062072865665, + "learning_rate": 1.3787756813978349e-05, + "loss": 0.001, + "num_tokens": 39597534.0, + "reward": 1.399999976158142, + "reward_std": 0.25657081604003906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.3999999761581421, + "rewards/fixed_code_pass_all_test_reward/std": 0.25657081604003906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 244.25, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8778823095369858, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.625, + "kl": 0.039692720863968134, + "learning_rate": 1.3784776697724307e-05, + "loss": 0.0016, + "num_tokens": 39602288.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 330.625, + "completions/mean_terminated_length": 330.625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.8780667773473528, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1640625, + "kl": 0.03313214611262083, + "learning_rate": 1.3781796189089788e-05, + "loss": 0.0013, + "num_tokens": 39612845.0, + "reward": 1.974662184715271, + "reward_std": 0.07166623324155807, + "rewards/fixed_code_pass_all_test_reward/mean": 0.974662184715271, + "rewards/fixed_code_pass_all_test_reward/std": 0.07166622579097748, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 363.875, + "completions/mean_terminated_length": 363.875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.87825124515772, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.04306380683556199, + "learning_rate": 1.3778815288383794e-05, + "loss": 0.0017, + "num_tokens": 39623324.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 222.25, + "completions/mean_terminated_length": 222.25, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.8784357129680871, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.04802516894415021, + "learning_rate": 1.3775833995915356e-05, + "loss": 0.0019, + "num_tokens": 39630110.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 289.625, + "completions/mean_terminated_length": 289.625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8786201807784542, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.037016161950305104, + "learning_rate": 1.3772852311993561e-05, + "loss": 0.0015, + "num_tokens": 39639563.0, + "reward": 1.7999999523162842, + "reward_std": 0.38544961810112, + "rewards/fixed_code_pass_all_test_reward/mean": 0.925000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.2121320217847824, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 223.625, + "completions/mean_terminated_length": 223.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8788046485888212, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703125, + "kl": 0.08206583792343736, + "learning_rate": 1.3769870236927526e-05, + "loss": 0.0033, + "num_tokens": 39648440.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 383.625, + "completions/mean_terminated_length": 383.625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.8789891163991883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04052734375, + "kl": 0.02462981583084911, + "learning_rate": 1.3766887771026417e-05, + "loss": 0.001, + "num_tokens": 39664085.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 161.875, + "completions/mean_terminated_length": 161.875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.8791735842095554, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.125, + "kl": 0.07272317353636026, + "learning_rate": 1.3763904914599434e-05, + "loss": 0.0029, + "num_tokens": 39668172.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 350.875, + "completions/mean_terminated_length": 350.875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8793580520199226, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046630859375, + "kl": 0.03064386628102511, + "learning_rate": 1.3760921667955818e-05, + "loss": 0.0012, + "num_tokens": 39674635.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 252.0, + "completions/mean_terminated_length": 252.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.8795425198302896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5234375, + "kl": 0.04086106293834746, + "learning_rate": 1.3757938031404856e-05, + "loss": 0.0016, + "num_tokens": 39680859.0, + "reward": 1.0225000381469727, + "reward_std": 0.036154430359601974, + "rewards/fixed_code_pass_all_test_reward/mean": 0.022499999031424522, + "rewards/fixed_code_pass_all_test_reward/std": 0.036154430359601974, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 95.0, + "completions/mean_terminated_length": 95.0, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.8797269876406567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052978515625, + "kl": 0.018019924114923924, + "learning_rate": 1.3754954005255869e-05, + "loss": 0.0007, + "num_tokens": 39684315.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 392.125, + "completions/mean_terminated_length": 392.125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8799114554510238, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.21875, + "kl": 0.04209405032452196, + "learning_rate": 1.3751969589818221e-05, + "loss": 0.0017, + "num_tokens": 39692124.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 115.0, + "completions/max_terminated_length": 115.0, + "completions/mean_length": 92.625, + "completions/mean_terminated_length": 92.625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.8800959232613909, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.0858722566626966, + "learning_rate": 1.3748984785401318e-05, + "loss": 0.0034, + "num_tokens": 39695657.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 441.625, + "completions/mean_terminated_length": 441.625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.8802803910717579, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04833984375, + "kl": 0.02058216172736138, + "learning_rate": 1.3745999592314605e-05, + "loss": 0.0008, + "num_tokens": 39704054.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 288.25, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8804648588821251, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.046875, + "kl": 0.03259724169038236, + "learning_rate": 1.3743014010867564e-05, + "loss": 0.0013, + "num_tokens": 39710944.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.625, + "rewards/format_reward/std": 0.5175492167472839, + "step": 4773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8806493266924922, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.578125, + "kl": 0.04258326534181833, + "learning_rate": 1.374002804136972e-05, + "loss": 0.0017, + "num_tokens": 39719351.0, + "reward": 1.3161765336990356, + "reward_std": 0.42989787459373474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.31617647409439087, + "rewards/fixed_code_pass_all_test_reward/std": 0.42989784479141235, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 190.125, + "completions/mean_terminated_length": 190.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8808337945028593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056640625, + "kl": 0.0331149403937161, + "learning_rate": 1.373704168413064e-05, + "loss": 0.0013, + "num_tokens": 39723864.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 217.625, + "completions/mean_terminated_length": 217.625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.8810182623132263, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4609375, + "kl": 0.049930619308725, + "learning_rate": 1.3734054939459936e-05, + "loss": 0.002, + "num_tokens": 39729485.0, + "reward": 1.5499999523162842, + "reward_std": 0.3726353943347931, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5499999523162842, + "rewards/fixed_code_pass_all_test_reward/std": 0.3726354241371155, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 411.25, + "completions/mean_terminated_length": 411.25, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.8812027301235934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.349609375, + "kl": 0.03233512002043426, + "learning_rate": 1.3731067807667243e-05, + "loss": 0.0013, + "num_tokens": 39741823.0, + "reward": 1.649193525314331, + "reward_std": 0.2623138129711151, + "rewards/fixed_code_pass_all_test_reward/mean": 0.649193525314331, + "rewards/fixed_code_pass_all_test_reward/std": 0.2623138129711151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.8813871979339605, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1875, + "kl": 0.06477592792361975, + "learning_rate": 1.3728080289062252e-05, + "loss": 0.0026, + "num_tokens": 39745873.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8815716657443277, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.035850972635671496, + "learning_rate": 1.3725092383954688e-05, + "loss": 0.0014, + "num_tokens": 39754861.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 231.125, + "completions/mean_terminated_length": 231.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8817561335546947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1728515625, + "kl": 0.09666113369166851, + "learning_rate": 1.3722104092654318e-05, + "loss": 0.0039, + "num_tokens": 39760270.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 310.625, + "completions/mean_terminated_length": 310.625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8819406013650618, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.06950349593535066, + "learning_rate": 1.371911541547095e-05, + "loss": 0.0028, + "num_tokens": 39769475.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 156.75, + "completions/mean_terminated_length": 156.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8821250691754289, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08544921875, + "kl": 0.077410111669451, + "learning_rate": 1.3716126352714428e-05, + "loss": 0.0031, + "num_tokens": 39773777.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 1076.125, + "completions/mean_terminated_length": 1076.125, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.882309536985796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.384765625, + "kl": 0.014730526134371758, + "learning_rate": 1.3713136904694637e-05, + "loss": 0.0006, + "num_tokens": 39794506.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.882494004796163, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.057129884604364634, + "learning_rate": 1.3710147071721505e-05, + "loss": 0.0023, + "num_tokens": 39800432.0, + "reward": 1.91847825050354, + "reward_std": 0.23057830333709717, + "rewards/fixed_code_pass_all_test_reward/mean": 0.91847825050354, + "rewards/fixed_code_pass_all_test_reward/std": 0.23057828843593597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 249.625, + "completions/mean_terminated_length": 249.625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8826784726065302, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759765625, + "kl": 0.019371931557543576, + "learning_rate": 1.3707156854104998e-05, + "loss": 0.0008, + "num_tokens": 39805837.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 297.875, + "completions/mean_terminated_length": 297.875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.8828629404168973, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.296875, + "kl": 0.04092761117499322, + "learning_rate": 1.370416625215512e-05, + "loss": 0.0016, + "num_tokens": 39812284.0, + "reward": 1.370192289352417, + "reward_std": 0.17915162444114685, + "rewards/fixed_code_pass_all_test_reward/mean": 0.370192289352417, + "rewards/fixed_code_pass_all_test_reward/std": 0.17915163934230804, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 260.5, + "completions/mean_terminated_length": 260.5, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.8830474082272644, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.23828125, + "kl": 0.052423154236748815, + "learning_rate": 1.3701175266181918e-05, + "loss": 0.0021, + "num_tokens": 39820672.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 275.0, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.8832318760376314, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05517578125, + "kl": 0.03279749350622296, + "learning_rate": 1.3698183896495483e-05, + "loss": 0.0013, + "num_tokens": 39830064.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 191.375, + "completions/mean_terminated_length": 191.375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8834163438479985, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.04007199499756098, + "learning_rate": 1.369519214340593e-05, + "loss": 0.0016, + "num_tokens": 39834427.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 473.375, + "completions/mean_terminated_length": 473.375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.8836008116583656, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.048756099538877606, + "learning_rate": 1.3692200007223428e-05, + "loss": 0.002, + "num_tokens": 39848150.0, + "reward": 1.5535714626312256, + "reward_std": 0.4056113660335541, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.09438391774892807, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 277.875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8837852794687328, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.03142055100761354, + "learning_rate": 1.3689207488258185e-05, + "loss": 0.0013, + "num_tokens": 39857261.0, + "reward": 1.668269157409668, + "reward_std": 0.4578319489955902, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6682692170143127, + "rewards/fixed_code_pass_all_test_reward/std": 0.4578319489955902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 377.375, + "completions/mean_terminated_length": 377.375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.8839697472790998, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.86328125, + "kl": 0.019864696776494384, + "learning_rate": 1.3686214586820443e-05, + "loss": 0.0008, + "num_tokens": 39865384.0, + "reward": 1.8125, + "reward_std": 0.3471825420856476, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3471825420856476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 475.125, + "completions/mean_terminated_length": 475.125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.8841542150894669, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0, + "kl": 0.03055708098690957, + "learning_rate": 1.3683221303220486e-05, + "loss": 0.0012, + "num_tokens": 39879409.0, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 251.125, + "completions/mean_terminated_length": 251.125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.884338682899834, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.169921875, + "kl": 0.05331244086846709, + "learning_rate": 1.368022763776864e-05, + "loss": 0.0021, + "num_tokens": 39884410.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 117.875, + "completions/mean_terminated_length": 117.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.884523150710201, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.05202515935525298, + "learning_rate": 1.3677233590775262e-05, + "loss": 0.0021, + "num_tokens": 39888121.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 438.25, + "completions/mean_terminated_length": 438.25, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.8847076185205681, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.02664763364009559, + "learning_rate": 1.3674239162550764e-05, + "loss": 0.0011, + "num_tokens": 39896971.0, + "reward": 1.7083332538604736, + "reward_std": 0.3032888174057007, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7083333730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.30328884720802307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 245.75, + "completions/mean_terminated_length": 245.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.8848920863309353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0869140625, + "kl": 0.03534804133232683, + "learning_rate": 1.3671244353405582e-05, + "loss": 0.0014, + "num_tokens": 39901801.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 169.625, + "completions/mean_terminated_length": 169.625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.8850765541413024, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.041021901182830334, + "learning_rate": 1.3668249163650197e-05, + "loss": 0.0016, + "num_tokens": 39905934.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 260.625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.8852610219516694, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4453125, + "kl": 0.04544982104562223, + "learning_rate": 1.3665253593595135e-05, + "loss": 0.0018, + "num_tokens": 39912107.0, + "reward": 1.485576868057251, + "reward_std": 0.04079460725188255, + "rewards/fixed_code_pass_all_test_reward/mean": 0.48557692766189575, + "rewards/fixed_code_pass_all_test_reward/std": 0.04079461842775345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 198.625, + "completions/mean_terminated_length": 198.625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.8854454897620365, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07421875, + "kl": 0.031893985520582646, + "learning_rate": 1.3662257643550958e-05, + "loss": 0.0013, + "num_tokens": 39917032.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 319.625, + "completions/mean_terminated_length": 319.625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.8856299575724036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.75, + "kl": 0.026645993581041694, + "learning_rate": 1.365926131382826e-05, + "loss": 0.0011, + "num_tokens": 39925765.0, + "reward": 1.9880952835083008, + "reward_std": 0.022043362259864807, + "rewards/fixed_code_pass_all_test_reward/mean": 0.988095223903656, + "rewards/fixed_code_pass_all_test_reward/std": 0.02204333432018757, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 254.875, + "completions/mean_terminated_length": 254.875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.8858144253827707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.240234375, + "kl": 0.12777129234746099, + "learning_rate": 1.3656264604737683e-05, + "loss": 0.0051, + "num_tokens": 39933828.0, + "reward": 1.9642857313156128, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 423.625, + "completions/mean_terminated_length": 423.625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.8859988931931378, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.034718929207883775, + "learning_rate": 1.3653267516589909e-05, + "loss": 0.0014, + "num_tokens": 39944377.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 394.25, + "completions/mean_terminated_length": 394.25, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.8861833610035049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.87109375, + "kl": 0.03359021758660674, + "learning_rate": 1.365027004969565e-05, + "loss": 0.0013, + "num_tokens": 39951875.0, + "reward": 1.625, + "reward_std": 0.6697873473167419, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.3874864876270294, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 286.0, + "completions/mean_terminated_length": 286.0, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.886367828813872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.328125, + "kl": 0.04825064749456942, + "learning_rate": 1.364727220436567e-05, + "loss": 0.0019, + "num_tokens": 39960251.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 358.625, + "completions/mean_terminated_length": 358.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8865522966242391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.053420227486640215, + "learning_rate": 1.364427398091076e-05, + "loss": 0.0021, + "num_tokens": 39974152.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 501.0, + "completions/mean_terminated_length": 501.0, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.8867367644346061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.984375, + "kl": 0.0364383899141103, + "learning_rate": 1.364127537964176e-05, + "loss": 0.0015, + "num_tokens": 39983392.0, + "reward": 1.2083333730697632, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.2083333432674408, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 325.875, + "completions/mean_terminated_length": 325.875, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.8869212322449732, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040283203125, + "kl": 0.018094782019034028, + "learning_rate": 1.3638276400869544e-05, + "loss": 0.0007, + "num_tokens": 39994471.0, + "reward": 1.5056179761886597, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5056179761886597, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8871057000553404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.0380824850872159, + "learning_rate": 1.3635277044905025e-05, + "loss": 0.0015, + "num_tokens": 40003859.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 1088.875, + "completions/mean_terminated_length": 1088.875, + "completions/min_length": 976.0, + "completions/min_terminated_length": 976.0, + "epoch": 0.8872901678657075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.52734375, + "kl": 0.01806294033303857, + "learning_rate": 1.3632277312059157e-05, + "loss": 0.0007, + "num_tokens": 40024698.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 137.25, + "completions/mean_terminated_length": 137.25, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.8874746356760745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08447265625, + "kl": 0.03752576420083642, + "learning_rate": 1.3629277202642931e-05, + "loss": 0.0015, + "num_tokens": 40028468.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.8876591034864416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12451171875, + "kl": 0.04492254299111664, + "learning_rate": 1.3626276716967382e-05, + "loss": 0.0018, + "num_tokens": 40037122.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 728.0, + "completions/max_terminated_length": 728.0, + "completions/mean_length": 483.0, + "completions/mean_terminated_length": 483.0, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.8878435712968087, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.125, + "kl": 0.0470244197640568, + "learning_rate": 1.3623275855343576e-05, + "loss": 0.0019, + "num_tokens": 40049474.0, + "reward": 1.5, + "reward_std": 0.0890870913863182, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0890870913863182, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 355.875, + "completions/mean_terminated_length": 355.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.8880280391071758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03662109375, + "kl": 0.02145544334780425, + "learning_rate": 1.3620274618082628e-05, + "loss": 0.0009, + "num_tokens": 40059345.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 304.375, + "completions/mean_terminated_length": 304.375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8882125069175429, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046142578125, + "kl": 0.03213256527669728, + "learning_rate": 1.3617273005495682e-05, + "loss": 0.0013, + "num_tokens": 40067396.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 242.375, + "completions/mean_terminated_length": 242.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.88839697472791, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08203125, + "kl": 0.027460610144771636, + "learning_rate": 1.361427101789392e-05, + "loss": 0.0011, + "num_tokens": 40072855.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 480.625, + "completions/mean_terminated_length": 480.625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8885814425382771, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.63671875, + "kl": 0.06665262184105814, + "learning_rate": 1.361126865558858e-05, + "loss": 0.0027, + "num_tokens": 40082108.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 226.375, + "completions/mean_terminated_length": 226.375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.8887659103486442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2314453125, + "kl": 0.043979566777125, + "learning_rate": 1.3608265918890919e-05, + "loss": 0.0018, + "num_tokens": 40087439.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 373.0, + "completions/mean_terminated_length": 373.0, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.8889503781590112, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0078125, + "kl": 0.02769853570498526, + "learning_rate": 1.3605262808112247e-05, + "loss": 0.0011, + "num_tokens": 40098815.0, + "reward": 1.9076087474822998, + "reward_std": 0.1745910942554474, + "rewards/fixed_code_pass_all_test_reward/mean": 0.907608687877655, + "rewards/fixed_code_pass_all_test_reward/std": 0.1745910793542862, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 356.375, + "completions/mean_terminated_length": 356.375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8891348459693783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8046875, + "kl": 0.02794547099620104, + "learning_rate": 1.3602259323563895e-05, + "loss": 0.0011, + "num_tokens": 40104498.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 157.0, + "completions/max_terminated_length": 157.0, + "completions/mean_length": 128.875, + "completions/mean_terminated_length": 128.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.8893193137797454, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30859375, + "kl": 0.0868347748182714, + "learning_rate": 1.3599255465557257e-05, + "loss": 0.0035, + "num_tokens": 40108385.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 330.125, + "completions/mean_terminated_length": 330.125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.8895037815901126, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0810546875, + "kl": 0.027131236158311367, + "learning_rate": 1.3596251234403747e-05, + "loss": 0.0011, + "num_tokens": 40118082.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 145.0, + "completions/max_terminated_length": 145.0, + "completions/mean_length": 123.625, + "completions/mean_terminated_length": 123.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.8896882494004796, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.1875, + "kl": 0.0660396353341639, + "learning_rate": 1.3593246630414826e-05, + "loss": 0.0026, + "num_tokens": 40121887.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 187.875, + "completions/mean_terminated_length": 187.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.8898727172108467, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.453125, + "kl": 0.06254475214518607, + "learning_rate": 1.3590241653901986e-05, + "loss": 0.0025, + "num_tokens": 40126198.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 278.875, + "completions/mean_terminated_length": 278.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.8900571850212138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.05223834654316306, + "learning_rate": 1.358723630517677e-05, + "loss": 0.0021, + "num_tokens": 40134629.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 161.375, + "completions/mean_terminated_length": 161.375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.8902416528315809, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.021204240154474974, + "learning_rate": 1.3584230584550749e-05, + "loss": 0.0008, + "num_tokens": 40138752.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 376.75, + "completions/mean_terminated_length": 376.75, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.8904261206419479, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.125, + "kl": 0.14607433998025954, + "learning_rate": 1.3581224492335536e-05, + "loss": 0.0058, + "num_tokens": 40150014.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 208.75, + "completions/mean_terminated_length": 208.75, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.8906105884523151, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.05222631571814418, + "learning_rate": 1.3578218028842782e-05, + "loss": 0.0021, + "num_tokens": 40155580.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 255.125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.8907950562626822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.04727358045056462, + "learning_rate": 1.3575211194384182e-05, + "loss": 0.0019, + "num_tokens": 40164237.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 308.875, + "completions/mean_terminated_length": 308.875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.8909795240730493, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.031919239554554224, + "learning_rate": 1.357220398927146e-05, + "loss": 0.0013, + "num_tokens": 40171148.0, + "reward": 1.9318182468414307, + "reward_std": 0.15932266414165497, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9318181872367859, + "rewards/fixed_code_pass_all_test_reward/std": 0.15932264924049377, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 389.375, + "completions/mean_terminated_length": 389.375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.8911639918834163, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.033594183158129454, + "learning_rate": 1.356919641381638e-05, + "loss": 0.0013, + "num_tokens": 40181055.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 274.0, + "completions/mean_terminated_length": 274.0, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8913484596937834, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5546875, + "kl": 0.036877939477562904, + "learning_rate": 1.3566188468330754e-05, + "loss": 0.0015, + "num_tokens": 40189919.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 225.5, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.8915329275041505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.05413321126252413, + "learning_rate": 1.3563180153126423e-05, + "loss": 0.0022, + "num_tokens": 40197171.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 249.375, + "completions/mean_terminated_length": 249.375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.8917173953145177, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.28125, + "kl": 0.03542689455207437, + "learning_rate": 1.356017146851527e-05, + "loss": 0.0014, + "num_tokens": 40201886.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 258.25, + "completions/mean_terminated_length": 258.25, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.8919018631248847, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05712890625, + "kl": 0.021938129095360637, + "learning_rate": 1.3557162414809213e-05, + "loss": 0.0009, + "num_tokens": 40207864.0, + "reward": 1.7999999523162842, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.800000011920929, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 272.25, + "completions/mean_terminated_length": 272.25, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8920863309352518, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.03136964561417699, + "learning_rate": 1.3554152992320213e-05, + "loss": 0.0013, + "num_tokens": 40213322.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 278.5, + "completions/mean_terminated_length": 278.5, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.8922707987456189, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09912109375, + "kl": 0.039530866546556354, + "learning_rate": 1.3551143201360266e-05, + "loss": 0.0016, + "num_tokens": 40221710.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 107.375, + "completions/mean_terminated_length": 107.375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.892455266555986, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.080078125, + "kl": 0.03659618191886693, + "learning_rate": 1.354813304224141e-05, + "loss": 0.0015, + "num_tokens": 40225305.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 95.25, + "completions/mean_terminated_length": 95.25, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.892639734366353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.05242172966245562, + "learning_rate": 1.354512251527571e-05, + "loss": 0.0021, + "num_tokens": 40228891.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 224.375, + "completions/mean_terminated_length": 224.375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.8928242021767202, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6640625, + "kl": 0.0479169525206089, + "learning_rate": 1.3542111620775287e-05, + "loss": 0.0019, + "num_tokens": 40233526.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 284.25, + "completions/mean_terminated_length": 284.25, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.8930086699870873, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.07448538532480597, + "learning_rate": 1.3539100359052286e-05, + "loss": 0.003, + "num_tokens": 40243832.0, + "reward": 1.9271653890609741, + "reward_std": 0.2060074657201767, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9271653890609741, + "rewards/fixed_code_pass_all_test_reward/std": 0.2060074806213379, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 181.0, + "completions/mean_terminated_length": 181.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.8931931377974544, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.171875, + "kl": 0.018278325733263046, + "learning_rate": 1.3536088730418897e-05, + "loss": 0.0007, + "num_tokens": 40248128.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 480.125, + "completions/mean_terminated_length": 480.125, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.8933776056078214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034912109375, + "kl": 0.022718505351804197, + "learning_rate": 1.3533076735187341e-05, + "loss": 0.0009, + "num_tokens": 40257889.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 422.125, + "completions/mean_terminated_length": 422.125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.8935620734181885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07861328125, + "kl": 0.0413172859698534, + "learning_rate": 1.3530064373669887e-05, + "loss": 0.0017, + "num_tokens": 40267690.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 410.75, + "completions/mean_terminated_length": 410.75, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8937465412285556, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12060546875, + "kl": 0.062498676124960184, + "learning_rate": 1.3527051646178832e-05, + "loss": 0.0025, + "num_tokens": 40277504.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 218.25, + "completions/mean_terminated_length": 218.25, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.8939310090389228, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.046875, + "kl": 0.0595300542190671, + "learning_rate": 1.3524038553026519e-05, + "loss": 0.0024, + "num_tokens": 40286122.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 102.5, + "completions/mean_terminated_length": 102.5, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.8941154768492898, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.390625, + "kl": 0.05630644969642162, + "learning_rate": 1.3521025094525323e-05, + "loss": 0.0023, + "num_tokens": 40289726.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 327.75, + "completions/mean_terminated_length": 327.75, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.8942999446596569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12109375, + "kl": 0.05172666581347585, + "learning_rate": 1.3518011270987661e-05, + "loss": 0.0021, + "num_tokens": 40298548.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 443.125, + "completions/mean_terminated_length": 443.125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.894484412470024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9375, + "kl": 0.02979980653617531, + "learning_rate": 1.3514997082725985e-05, + "loss": 0.0012, + "num_tokens": 40306061.0, + "reward": 1.7000000476837158, + "reward_std": 0.46598589420318604, + "rewards/fixed_code_pass_all_test_reward/mean": 0.949999988079071, + "rewards/fixed_code_pass_all_test_reward/std": 0.0534522607922554, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 229.75, + "completions/mean_terminated_length": 229.75, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.894668880280391, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055419921875, + "kl": 0.0531901849899441, + "learning_rate": 1.3511982530052787e-05, + "loss": 0.0021, + "num_tokens": 40313571.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.8948533480907581, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.05117065703961998, + "learning_rate": 1.3508967613280594e-05, + "loss": 0.002, + "num_tokens": 40323746.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 271.625, + "completions/mean_terminated_length": 271.625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.8950378159011253, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1015625, + "kl": 0.02618949592579156, + "learning_rate": 1.3505952332721976e-05, + "loss": 0.001, + "num_tokens": 40332183.0, + "reward": 1.5, + "reward_std": 0.5345224738121033, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.5345224738121033, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 269.625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.8952222837114924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.390625, + "kl": 0.051704831421375275, + "learning_rate": 1.3502936688689534e-05, + "loss": 0.0021, + "num_tokens": 40338308.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 310.25, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.8954067515218594, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.061279296875, + "kl": 0.02149977011140436, + "learning_rate": 1.3499920681495915e-05, + "loss": 0.0009, + "num_tokens": 40344238.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 322.5, + "completions/mean_terminated_length": 322.5, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.8955912193322265, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05029296875, + "kl": 0.027827743906527758, + "learning_rate": 1.349690431145379e-05, + "loss": 0.0011, + "num_tokens": 40350898.0, + "reward": 1.0612244606018066, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.06122449040412903, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 261.25, + "completions/mean_terminated_length": 261.25, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.8957756871425936, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0400390625, + "kl": 0.022374557447619736, + "learning_rate": 1.3493887578875881e-05, + "loss": 0.0009, + "num_tokens": 40357004.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 242.75, + "completions/mean_terminated_length": 242.75, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.8959601549529607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.04444209625944495, + "learning_rate": 1.3490870484074943e-05, + "loss": 0.0018, + "num_tokens": 40366762.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 303.125, + "completions/mean_terminated_length": 303.125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8961446227633278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.78515625, + "kl": 0.05666601238772273, + "learning_rate": 1.348785302736377e-05, + "loss": 0.0023, + "num_tokens": 40376755.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 177.25, + "completions/mean_terminated_length": 177.25, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.8963290905736949, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059814453125, + "kl": 0.018445522291585803, + "learning_rate": 1.3484835209055188e-05, + "loss": 0.0007, + "num_tokens": 40381229.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 291.875, + "completions/mean_terminated_length": 291.875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.896513558384062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09033203125, + "kl": 0.033602256793528795, + "learning_rate": 1.3481817029462066e-05, + "loss": 0.0013, + "num_tokens": 40387988.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 282.875, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.8966980261944291, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.04112620232626796, + "learning_rate": 1.3478798488897309e-05, + "loss": 0.0016, + "num_tokens": 40396563.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 247.625, + "completions/mean_terminated_length": 247.625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8968824940047961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041748046875, + "kl": 0.03333612787537277, + "learning_rate": 1.3475779587673859e-05, + "loss": 0.0013, + "num_tokens": 40406296.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 354.25, + "completions/mean_terminated_length": 354.25, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8970669618151632, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.03400420397520065, + "learning_rate": 1.3472760326104694e-05, + "loss": 0.0014, + "num_tokens": 40414818.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 272.75, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8972514296255304, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08251953125, + "kl": 0.04012254602275789, + "learning_rate": 1.3469740704502835e-05, + "loss": 0.0016, + "num_tokens": 40423656.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 371.0, + "completions/mean_terminated_length": 371.0, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.8974358974358975, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.029997592326253653, + "learning_rate": 1.3466720723181337e-05, + "loss": 0.0012, + "num_tokens": 40431240.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 311.625, + "completions/mean_terminated_length": 311.625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.8976203652462645, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5078125, + "kl": 0.04764627886470407, + "learning_rate": 1.3463700382453281e-05, + "loss": 0.0019, + "num_tokens": 40438333.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 439.75, + "completions/mean_terminated_length": 439.75, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8978048330566316, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.09375, + "kl": 0.04556818585842848, + "learning_rate": 1.346067968263181e-05, + "loss": 0.0018, + "num_tokens": 40447027.0, + "reward": 1.6785714626312256, + "reward_std": 0.3393528461456299, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6785714626312256, + "rewards/fixed_code_pass_all_test_reward/std": 0.3393528461456299, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 286.25, + "completions/mean_terminated_length": 286.25, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8979893008669987, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.04446255601942539, + "learning_rate": 1.3457658624030079e-05, + "loss": 0.0018, + "num_tokens": 40453421.0, + "reward": 1.625, + "reward_std": 0.16690461337566376, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.16690459847450256, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 376.625, + "completions/mean_terminated_length": 376.625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.8981737686773658, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1748046875, + "kl": 0.03523054451216012, + "learning_rate": 1.3454637206961299e-05, + "loss": 0.0014, + "num_tokens": 40459930.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 258.875, + "completions/mean_terminated_length": 258.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.8983582364877329, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.02882067859172821, + "learning_rate": 1.3451615431738703e-05, + "loss": 0.0012, + "num_tokens": 40468585.0, + "reward": 1.7874999046325684, + "reward_std": 0.3234752416610718, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7874999642372131, + "rewards/fixed_code_pass_all_test_reward/std": 0.3234752416610718, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 377.625, + "completions/mean_terminated_length": 377.625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.8985427042981, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.421875, + "kl": 0.05101708578877151, + "learning_rate": 1.3448593298675577e-05, + "loss": 0.002, + "num_tokens": 40481894.0, + "reward": 1.5, + "reward_std": 0.3118034303188324, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.3118034601211548, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 295.25, + "completions/mean_terminated_length": 295.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.8987271721084671, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.042133047012612224, + "learning_rate": 1.3445570808085229e-05, + "loss": 0.0017, + "num_tokens": 40491640.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 359.125, + "completions/mean_terminated_length": 359.125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.8989116399188342, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.453125, + "kl": 0.04411594523116946, + "learning_rate": 1.3442547960281014e-05, + "loss": 0.0018, + "num_tokens": 40501113.0, + "reward": 1.1193182468414307, + "reward_std": 0.048211827874183655, + "rewards/fixed_code_pass_all_test_reward/mean": 0.11931818723678589, + "rewards/fixed_code_pass_all_test_reward/std": 0.048211827874183655, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 347.625, + "completions/mean_terminated_length": 347.625, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.8990961077292012, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0439453125, + "kl": 0.02667079772800207, + "learning_rate": 1.343952475557632e-05, + "loss": 0.0011, + "num_tokens": 40509902.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 175.75, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.8992805755395683, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06884765625, + "kl": 0.0529350230935961, + "learning_rate": 1.343650119428457e-05, + "loss": 0.0021, + "num_tokens": 40514124.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 280.75, + "completions/mean_terminated_length": 280.75, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.8994650433499355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049072265625, + "kl": 0.023814132902771235, + "learning_rate": 1.3433477276719231e-05, + "loss": 0.001, + "num_tokens": 40520562.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1181.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 796.375, + "completions/mean_terminated_length": 796.375, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "epoch": 0.8996495111603026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.404296875, + "kl": 0.017301700892858207, + "learning_rate": 1.3430453003193801e-05, + "loss": 0.0007, + "num_tokens": 40536629.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 303.125, + "completions/mean_terminated_length": 303.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8998339789706696, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1318359375, + "kl": 0.051614079624414444, + "learning_rate": 1.3427428374021816e-05, + "loss": 0.0021, + "num_tokens": 40545990.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 441.125, + "completions/mean_terminated_length": 441.125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.9000184467810367, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10693359375, + "kl": 0.057898911414667964, + "learning_rate": 1.3424403389516854e-05, + "loss": 0.0023, + "num_tokens": 40558095.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 454.0, + "completions/mean_terminated_length": 454.0, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.9002029145914038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.050357280066236854, + "learning_rate": 1.3421378049992514e-05, + "loss": 0.002, + "num_tokens": 40566423.0, + "reward": 1.7916667461395264, + "reward_std": 0.39591163396835327, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7916666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.39591163396835327, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 563.875, + "completions/mean_terminated_length": 563.875, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.9003873824017709, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025634765625, + "kl": 0.026917697046883404, + "learning_rate": 1.3418352355762456e-05, + "loss": 0.0011, + "num_tokens": 40579886.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 359.0, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.900571850212138, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.038975171046331525, + "learning_rate": 1.3415326307140355e-05, + "loss": 0.0016, + "num_tokens": 40589726.0, + "reward": 1.53125, + "reward_std": 0.7372426986694336, + "rewards/fixed_code_pass_all_test_reward/mean": 0.65625, + "rewards/fixed_code_pass_all_test_reward/std": 0.48065248131752014, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 364.0, + "completions/mean_terminated_length": 364.0, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.9007563180225051, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049560546875, + "kl": 0.01788379775825888, + "learning_rate": 1.3412299904439939e-05, + "loss": 0.0007, + "num_tokens": 40597942.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 255.875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.9009407858328722, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.7578125, + "kl": 0.0736178532242775, + "learning_rate": 1.3409273147974957e-05, + "loss": 0.0029, + "num_tokens": 40607037.0, + "reward": 1.90625, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 360.875, + "completions/mean_terminated_length": 360.875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.9011252536432393, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.042822302551940084, + "learning_rate": 1.3406246038059208e-05, + "loss": 0.0017, + "num_tokens": 40620228.0, + "reward": 1.6443965435028076, + "reward_std": 0.4907793700695038, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6443965435028076, + "rewards/fixed_code_pass_all_test_reward/std": 0.49077939987182617, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 277.75, + "completions/mean_terminated_length": 277.75, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.9013097214536063, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3125, + "kl": 0.029818235663697124, + "learning_rate": 1.3403218575006522e-05, + "loss": 0.0012, + "num_tokens": 40630490.0, + "reward": 1.7652559280395508, + "reward_std": 0.4358837604522705, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7652559280395508, + "rewards/fixed_code_pass_all_test_reward/std": 0.4358838200569153, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 353.25, + "completions/mean_terminated_length": 353.25, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.9014941892639734, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0546875, + "kl": 0.035602202638983727, + "learning_rate": 1.3400190759130767e-05, + "loss": 0.0014, + "num_tokens": 40641892.0, + "reward": 1.607954502105713, + "reward_std": 0.04821188002824783, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6079545021057129, + "rewards/fixed_code_pass_all_test_reward/std": 0.04821184277534485, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 274.875, + "completions/mean_terminated_length": 274.875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.9016786570743405, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.02627052366733551, + "learning_rate": 1.3397162590745845e-05, + "loss": 0.0011, + "num_tokens": 40652155.0, + "reward": 1.8804347515106201, + "reward_std": 0.22139178216457367, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8804347515106201, + "rewards/fixed_code_pass_all_test_reward/std": 0.22139176726341248, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 193.875, + "completions/mean_terminated_length": 193.875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.9018631248847077, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0751953125, + "kl": 0.02698020508978516, + "learning_rate": 1.3394134070165696e-05, + "loss": 0.0011, + "num_tokens": 40656650.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 395.0, + "completions/mean_terminated_length": 395.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.9020475926950747, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4921875, + "kl": 0.06621546461246908, + "learning_rate": 1.33911051977043e-05, + "loss": 0.0026, + "num_tokens": 40666626.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 381.125, + "completions/mean_terminated_length": 381.125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.9022320605054418, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.390625, + "kl": 0.026237699203193188, + "learning_rate": 1.3388075973675667e-05, + "loss": 0.001, + "num_tokens": 40674627.0, + "reward": 1.8571429252624512, + "reward_std": 0.3499270975589752, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9821428656578064, + "rewards/fixed_code_pass_all_test_reward/std": 0.05050762742757797, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.9024165283158089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.076171875, + "kl": 0.03774978092405945, + "learning_rate": 1.3385046398393851e-05, + "loss": 0.0015, + "num_tokens": 40684477.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 271.875, + "completions/mean_terminated_length": 271.875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.902600996126176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.02628690586425364, + "learning_rate": 1.338201647217293e-05, + "loss": 0.0011, + "num_tokens": 40690876.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 364.75, + "completions/mean_terminated_length": 364.75, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.902785463936543, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.15625, + "kl": 0.039824796142056584, + "learning_rate": 1.3378986195327034e-05, + "loss": 0.0016, + "num_tokens": 40701586.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 272.875, + "completions/mean_terminated_length": 272.875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.9029699317469102, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06689453125, + "kl": 0.02810564578976482, + "learning_rate": 1.337595556817032e-05, + "loss": 0.0011, + "num_tokens": 40709849.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 680.75, + "completions/mean_terminated_length": 680.75, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 0.9031543995572773, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0390625, + "kl": 0.024953900603577495, + "learning_rate": 1.3372924591016988e-05, + "loss": 0.001, + "num_tokens": 40722167.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 952.0, + "completions/max_terminated_length": 952.0, + "completions/mean_length": 690.25, + "completions/mean_terminated_length": 690.25, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "epoch": 0.9033388673676443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.52734375, + "kl": 0.016722447704523802, + "learning_rate": 1.3369893264181255e-05, + "loss": 0.0007, + "num_tokens": 40734737.0, + "reward": 1.9301470518112183, + "reward_std": 0.09683255106210709, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9301470518112183, + "rewards/fixed_code_pass_all_test_reward/std": 0.09683256596326828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 244.5, + "completions/mean_terminated_length": 244.5, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.9035233351780114, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.07078809663653374, + "learning_rate": 1.3366861587977406e-05, + "loss": 0.0028, + "num_tokens": 40739493.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 145.875, + "completions/mean_terminated_length": 145.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.9037078029883785, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07958984375, + "kl": 0.07265775790438056, + "learning_rate": 1.3363829562719737e-05, + "loss": 0.0029, + "num_tokens": 40743380.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 167.5, + "completions/mean_terminated_length": 167.5, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.9038922707987456, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.150390625, + "kl": 0.057736681774258614, + "learning_rate": 1.3360797188722586e-05, + "loss": 0.0023, + "num_tokens": 40747544.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 279.5, + "completions/mean_terminated_length": 279.5, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.9040767386091128, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0595703125, + "kl": 0.04446895164437592, + "learning_rate": 1.335776446630033e-05, + "loss": 0.0018, + "num_tokens": 40758652.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 237.75, + "completions/mean_terminated_length": 237.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.9042612064194798, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0830078125, + "kl": 0.042834551306441426, + "learning_rate": 1.335473139576739e-05, + "loss": 0.0017, + "num_tokens": 40768530.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 170.625, + "completions/mean_terminated_length": 170.625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.9044456742298469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043212890625, + "kl": 0.02027237555012107, + "learning_rate": 1.3351697977438204e-05, + "loss": 0.0008, + "num_tokens": 40773023.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 248.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.904630142040214, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.78125, + "kl": 0.047518352046608925, + "learning_rate": 1.3348664211627265e-05, + "loss": 0.0019, + "num_tokens": 40780470.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 354.0, + "completions/mean_terminated_length": 354.0, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.904814609850581, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2890625, + "kl": 0.054313099244609475, + "learning_rate": 1.3345630098649084e-05, + "loss": 0.0022, + "num_tokens": 40788590.0, + "reward": 1.4375, + "reward_std": 0.40779241919517517, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4375, + "rewards/fixed_code_pass_all_test_reward/std": 0.40779241919517517, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 353.25, + "completions/mean_terminated_length": 353.25, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.9049990776609481, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0947265625, + "kl": 0.07645459473133087, + "learning_rate": 1.334259563881823e-05, + "loss": 0.0031, + "num_tokens": 40796624.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 578.75, + "completions/mean_terminated_length": 578.75, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.9051835454713153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.89453125, + "kl": 0.03981074463808909, + "learning_rate": 1.3339560832449284e-05, + "loss": 0.0016, + "num_tokens": 40809134.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 333.25, + "completions/mean_terminated_length": 333.25, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9053680132816824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.99609375, + "kl": 0.059877749998122454, + "learning_rate": 1.3336525679856877e-05, + "loss": 0.0024, + "num_tokens": 40818872.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 196.375, + "completions/mean_terminated_length": 196.375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.9055524810920494, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0927734375, + "kl": 0.027943127555772662, + "learning_rate": 1.333349018135568e-05, + "loss": 0.0011, + "num_tokens": 40825115.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 346.375, + "completions/mean_terminated_length": 346.375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.9057369489024165, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06298828125, + "kl": 0.03194777714088559, + "learning_rate": 1.333045433726039e-05, + "loss": 0.0013, + "num_tokens": 40836198.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 231.375, + "completions/mean_terminated_length": 231.375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.9059214167127836, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19140625, + "kl": 0.05417779728304595, + "learning_rate": 1.332741814788574e-05, + "loss": 0.0022, + "num_tokens": 40841337.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 113.875, + "completions/mean_terminated_length": 113.875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9061058845231507, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.046875, + "kl": 0.15514911990612745, + "learning_rate": 1.3324381613546505e-05, + "loss": 0.0062, + "num_tokens": 40844968.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 479.875, + "completions/mean_terminated_length": 479.875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.9062903523335178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048828125, + "kl": 0.027204880490899086, + "learning_rate": 1.3321344734557488e-05, + "loss": 0.0011, + "num_tokens": 40858007.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 362.375, + "completions/mean_terminated_length": 362.375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.9064748201438849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.875, + "kl": 0.044925668742507696, + "learning_rate": 1.3318307511233542e-05, + "loss": 0.0018, + "num_tokens": 40865354.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 665.0, + "completions/max_terminated_length": 665.0, + "completions/mean_length": 426.25, + "completions/mean_terminated_length": 426.25, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.906659287954252, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3046875, + "kl": 0.04387883469462395, + "learning_rate": 1.3315269943889536e-05, + "loss": 0.0018, + "num_tokens": 40877548.0, + "reward": 1.8406250476837158, + "reward_std": 0.3204175531864166, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8406250476837158, + "rewards/fixed_code_pass_all_test_reward/std": 0.320417582988739, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 210.25, + "completions/mean_terminated_length": 210.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.9068437557646191, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09423828125, + "kl": 0.04857798223383725, + "learning_rate": 1.3312232032840391e-05, + "loss": 0.0019, + "num_tokens": 40882230.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 230.75, + "completions/mean_terminated_length": 230.75, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.9070282235749861, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054931640625, + "kl": 0.022467744420282543, + "learning_rate": 1.3309193778401055e-05, + "loss": 0.0009, + "num_tokens": 40887916.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.9072126913853532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3828125, + "kl": 0.057570791454054415, + "learning_rate": 1.3306155180886517e-05, + "loss": 0.0023, + "num_tokens": 40892032.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.9073971591957204, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07666015625, + "kl": 0.034020210383459926, + "learning_rate": 1.3303116240611793e-05, + "loss": 0.0014, + "num_tokens": 40897362.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 325.375, + "completions/mean_terminated_length": 325.375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.9075816270060875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.55078125, + "kl": 0.039389976183883846, + "learning_rate": 1.3300076957891946e-05, + "loss": 0.0016, + "num_tokens": 40906701.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 484.875, + "completions/mean_terminated_length": 484.875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.9077660948164545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9921875, + "kl": 0.032578504644334316, + "learning_rate": 1.3297037333042065e-05, + "loss": 0.0013, + "num_tokens": 40915932.0, + "reward": 1.0833332538604736, + "reward_std": 0.0690065547823906, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0833333358168602, + "rewards/fixed_code_pass_all_test_reward/std": 0.06900656223297119, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 238.625, + "completions/mean_terminated_length": 238.625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.9079505626268216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1015625, + "kl": 0.02605132490862161, + "learning_rate": 1.3293997366377278e-05, + "loss": 0.001, + "num_tokens": 40921753.0, + "reward": 1.53125, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.53125, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 309.375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.9081350304371887, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.171875, + "kl": 0.04977713106200099, + "learning_rate": 1.329095705821275e-05, + "loss": 0.002, + "num_tokens": 40930940.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 334.875, + "completions/mean_terminated_length": 334.875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.9083194982475558, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.40625, + "kl": 0.04050698480568826, + "learning_rate": 1.3287916408863679e-05, + "loss": 0.0016, + "num_tokens": 40940459.0, + "reward": 1.8125, + "reward_std": 0.035355329513549805, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.035355325788259506, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 633.625, + "completions/mean_terminated_length": 633.625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.9085039660579229, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80078125, + "kl": 0.03587809926830232, + "learning_rate": 1.32848754186453e-05, + "loss": 0.0014, + "num_tokens": 40953760.0, + "reward": 1.797619104385376, + "reward_std": 0.3781786859035492, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9226190447807312, + "rewards/fixed_code_pass_all_test_reward/std": 0.20031964778900146, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 170.5, + "completions/mean_terminated_length": 170.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.90868843386829, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.07274649292230606, + "learning_rate": 1.3281834087872882e-05, + "loss": 0.0029, + "num_tokens": 40957852.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 300.5, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.9088729016786571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.019918264646548778, + "learning_rate": 1.327879241686173e-05, + "loss": 0.0008, + "num_tokens": 40967792.0, + "reward": 1.2411764860153198, + "reward_std": 0.010892018675804138, + "rewards/fixed_code_pass_all_test_reward/mean": 0.24117647111415863, + "rewards/fixed_code_pass_all_test_reward/std": 0.010892000049352646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 434.0, + "completions/mean_terminated_length": 434.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.9090573694890242, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0859375, + "kl": 0.03044092608615756, + "learning_rate": 1.3275750405927186e-05, + "loss": 0.0012, + "num_tokens": 40978528.0, + "reward": 1.8125, + "reward_std": 0.3720118999481201, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3720119297504425, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 159.5, + "completions/mean_terminated_length": 159.5, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.9092418372993912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06396484375, + "kl": 0.029675901168957353, + "learning_rate": 1.3272708055384624e-05, + "loss": 0.0012, + "num_tokens": 40982740.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 125.375, + "completions/mean_terminated_length": 125.375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9094263051097583, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08837890625, + "kl": 0.09727480821311474, + "learning_rate": 1.3269665365549454e-05, + "loss": 0.0039, + "num_tokens": 40986671.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 169.0, + "completions/max_terminated_length": 169.0, + "completions/mean_length": 141.125, + "completions/mean_terminated_length": 141.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.9096107729201255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.025985549087636173, + "learning_rate": 1.3266622336737123e-05, + "loss": 0.001, + "num_tokens": 40990496.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 390.0, + "completions/mean_terminated_length": 390.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.9097952407304926, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.030854206532239914, + "learning_rate": 1.326357896926311e-05, + "loss": 0.0012, + "num_tokens": 40998160.0, + "reward": 1.8392857313156128, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9642857313156128, + "rewards/fixed_code_pass_all_test_reward/std": 0.10101525485515594, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 420.375, + "completions/mean_terminated_length": 420.375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.9099797085408596, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052734375, + "kl": 0.024414860527031124, + "learning_rate": 1.3260535263442936e-05, + "loss": 0.001, + "num_tokens": 41009483.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 429.75, + "completions/mean_terminated_length": 429.75, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.9101641763512267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7734375, + "kl": 0.026198599487543106, + "learning_rate": 1.3257491219592145e-05, + "loss": 0.001, + "num_tokens": 41022369.0, + "reward": 1.7268518209457397, + "reward_std": 0.4350585341453552, + "rewards/fixed_code_pass_all_test_reward/mean": 0.7268518209457397, + "rewards/fixed_code_pass_all_test_reward/std": 0.4350585341453552, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 282.625, + "completions/mean_terminated_length": 282.625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.9103486441615938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.267578125, + "kl": 0.05025681125698611, + "learning_rate": 1.325444683802633e-05, + "loss": 0.002, + "num_tokens": 41032110.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 341.0, + "completions/mean_terminated_length": 341.0, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.9105331119719609, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1328125, + "kl": 0.03958134073764086, + "learning_rate": 1.3251402119061105e-05, + "loss": 0.0016, + "num_tokens": 41042686.0, + "reward": 1.8164557218551636, + "reward_std": 0.2552356421947479, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8164557218551636, + "rewards/fixed_code_pass_all_test_reward/std": 0.2552356719970703, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 299.25, + "completions/mean_terminated_length": 299.25, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.910717579782328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05322265625, + "kl": 0.025689915637485683, + "learning_rate": 1.3248357063012135e-05, + "loss": 0.001, + "num_tokens": 41050920.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 436.625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.9109020475926951, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.03860212117433548, + "learning_rate": 1.3245311670195108e-05, + "loss": 0.0015, + "num_tokens": 41061181.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.9110865154030622, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.53125, + "kl": 0.04649002104997635, + "learning_rate": 1.3242265940925743e-05, + "loss": 0.0019, + "num_tokens": 41069355.0, + "reward": 0.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 379.375, + "completions/mean_terminated_length": 379.375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.9112709832134293, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.078125, + "kl": 0.0359058219473809, + "learning_rate": 1.323921987551981e-05, + "loss": 0.0014, + "num_tokens": 41077950.0, + "reward": 1.5, + "reward_std": 0.4432026147842407, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.4432026445865631, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 421.25, + "completions/mean_terminated_length": 421.25, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.9114554510237963, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0556640625, + "kl": 0.026673976564779878, + "learning_rate": 1.3236173474293102e-05, + "loss": 0.0011, + "num_tokens": 41090944.0, + "reward": 1.6666667461395264, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.6666666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 287.625, + "completions/mean_terminated_length": 287.625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.9116399188341634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.890625, + "kl": 0.028657105984166265, + "learning_rate": 1.3233126737561448e-05, + "loss": 0.0011, + "num_tokens": 41097573.0, + "reward": 1.9525861740112305, + "reward_std": 0.13410648703575134, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9525861740112305, + "rewards/fixed_code_pass_all_test_reward/std": 0.13410647213459015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 327.375, + "completions/mean_terminated_length": 327.375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.9118243866445306, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4296875, + "kl": 0.030189326615072787, + "learning_rate": 1.3230079665640711e-05, + "loss": 0.0012, + "num_tokens": 41104088.0, + "reward": 1.8229167461395264, + "reward_std": 0.29693377017974854, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8229166269302368, + "rewards/fixed_code_pass_all_test_reward/std": 0.29693374037742615, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 244.375, + "completions/mean_terminated_length": 244.375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.9120088544548977, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08154296875, + "kl": 0.030878696125000715, + "learning_rate": 1.3227032258846799e-05, + "loss": 0.0012, + "num_tokens": 41110715.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 374.875, + "completions/mean_terminated_length": 374.875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.9121933222652647, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0703125, + "kl": 0.03172890481073409, + "learning_rate": 1.3223984517495643e-05, + "loss": 0.0013, + "num_tokens": 41121074.0, + "reward": 1.25, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.25, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 382.0, + "completions/mean_terminated_length": 382.0, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.9123777900756318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0291748046875, + "kl": 0.014575378852896392, + "learning_rate": 1.3220936441903212e-05, + "loss": 0.0006, + "num_tokens": 41129482.0, + "reward": 1.1333333253860474, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.13333334028720856, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 273.75, + "completions/mean_terminated_length": 273.75, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.9125622578859989, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.90625, + "kl": 0.0360656474949792, + "learning_rate": 1.3217888032385508e-05, + "loss": 0.0014, + "num_tokens": 41138448.0, + "reward": 1.4439655542373657, + "reward_std": 0.43076345324516296, + "rewards/fixed_code_pass_all_test_reward/mean": 0.44396552443504333, + "rewards/fixed_code_pass_all_test_reward/std": 0.43076348304748535, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 353.75, + "completions/mean_terminated_length": 353.75, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.912746725696366, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3828125, + "kl": 0.04588445834815502, + "learning_rate": 1.3214839289258574e-05, + "loss": 0.0018, + "num_tokens": 41148582.0, + "reward": 1.9945652484893799, + "reward_std": 0.015371870249509811, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9945652484893799, + "rewards/fixed_code_pass_all_test_reward/std": 0.015371883288025856, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 143.125, + "completions/mean_terminated_length": 143.125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.9129311935067331, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4140625, + "kl": 0.029633078491315246, + "learning_rate": 1.321179021283848e-05, + "loss": 0.0012, + "num_tokens": 41152495.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 277.625, + "completions/mean_terminated_length": 277.625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.9131156613171002, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05908203125, + "kl": 0.027881750371307135, + "learning_rate": 1.3208740803441336e-05, + "loss": 0.0011, + "num_tokens": 41159060.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 309.25, + "completions/mean_terminated_length": 309.25, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.9133001291274673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.060546875, + "kl": 0.03247127577196807, + "learning_rate": 1.3205691061383283e-05, + "loss": 0.0013, + "num_tokens": 41165782.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 454.75, + "completions/mean_terminated_length": 454.75, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.9134845969378343, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.28125, + "kl": 0.05534623563289642, + "learning_rate": 1.3202640986980504e-05, + "loss": 0.0022, + "num_tokens": 41177812.0, + "reward": 1.9456522464752197, + "reward_std": 0.15371885895729065, + "rewards/fixed_code_pass_all_test_reward/mean": 0.945652186870575, + "rewards/fixed_code_pass_all_test_reward/std": 0.15371887385845184, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 471.375, + "completions/mean_terminated_length": 471.375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.9136690647482014, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034912109375, + "kl": 0.02334114466793835, + "learning_rate": 1.31995905805492e-05, + "loss": 0.0009, + "num_tokens": 41186967.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 223.375, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.9138535325585685, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.05870185559615493, + "learning_rate": 1.3196539842405625e-05, + "loss": 0.0023, + "num_tokens": 41193794.0, + "reward": 1.9895833730697632, + "reward_std": 0.029462741687893867, + "rewards/fixed_code_pass_all_test_reward/mean": 0.9895833730697632, + "rewards/fixed_code_pass_all_test_reward/std": 0.029462775215506554, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 219.75, + "completions/mean_terminated_length": 219.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.9140380003689356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04345703125, + "kl": 0.030987567268311977, + "learning_rate": 1.3193488772866055e-05, + "loss": 0.0012, + "num_tokens": 41200448.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 389.0, + "completions/mean_terminated_length": 389.0, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.9142224681793027, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.515625, + "kl": 0.05428264872170985, + "learning_rate": 1.3190437372246807e-05, + "loss": 0.0022, + "num_tokens": 41210088.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.9144069359896698, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1103515625, + "kl": 0.02757007849868387, + "learning_rate": 1.3187385640864227e-05, + "loss": 0.0011, + "num_tokens": 41216994.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.9145914038000369, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6015625, + "kl": 0.020145446644164622, + "learning_rate": 1.3184333579034703e-05, + "loss": 0.0008, + "num_tokens": 41221281.0, + "reward": 1.125, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.125, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 199.625, + "completions/mean_terminated_length": 199.625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.914775871610404, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07080078125, + "kl": 0.03514883737079799, + "learning_rate": 1.3181281187074648e-05, + "loss": 0.0014, + "num_tokens": 41229110.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 268.75, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.914960339420771, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.068359375, + "kl": 0.021690602996386588, + "learning_rate": 1.3178228465300516e-05, + "loss": 0.0009, + "num_tokens": 41234244.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 304.25, + "completions/mean_terminated_length": 304.25, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.9151448072311381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059326171875, + "kl": 0.0181016429560259, + "learning_rate": 1.3175175414028792e-05, + "loss": 0.0007, + "num_tokens": 41245774.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 426.5, + "completions/mean_terminated_length": 426.5, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.9153292750415053, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.03917866060510278, + "learning_rate": 1.3172122033575995e-05, + "loss": 0.0016, + "num_tokens": 41254050.0, + "reward": 1.625, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.625, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 379.625, + "completions/mean_terminated_length": 379.625, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.9155137428518724, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0673828125, + "kl": 0.04402852681232616, + "learning_rate": 1.3169068324258683e-05, + "loss": 0.0018, + "num_tokens": 41264079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 548.625, + "completions/mean_terminated_length": 548.625, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "epoch": 0.9156982106622394, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0218505859375, + "kl": 0.01146214158507064, + "learning_rate": 1.3166014286393443e-05, + "loss": 0.0005, + "num_tokens": 41274748.0, + "reward": 1.2727272510528564, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.27272728085517883, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 490.375, + "completions/mean_terminated_length": 490.375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.9158826784726065, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.69140625, + "kl": 0.020233368617482483, + "learning_rate": 1.3162959920296895e-05, + "loss": 0.0008, + "num_tokens": 41283831.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 174.625, + "completions/mean_terminated_length": 174.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.9160671462829736, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1533203125, + "kl": 0.04282290278933942, + "learning_rate": 1.3159905226285692e-05, + "loss": 0.0017, + "num_tokens": 41287916.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 249.125, + "completions/mean_terminated_length": 249.125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.9162516140933407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050537109375, + "kl": 0.022994797094725072, + "learning_rate": 1.3156850204676532e-05, + "loss": 0.0009, + "num_tokens": 41294101.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 165.125, + "completions/mean_terminated_length": 165.125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.9164360819037078, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2734375, + "kl": 0.03972340631298721, + "learning_rate": 1.315379485578614e-05, + "loss": 0.0016, + "num_tokens": 41301542.0, + "reward": 1.8801021575927734, + "reward_std": 0.022979410365223885, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8801020383834839, + "rewards/fixed_code_pass_all_test_reward/std": 0.022979410365223885, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 362.375, + "completions/mean_terminated_length": 362.375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.9166205497140749, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0859375, + "kl": 0.03181058750487864, + "learning_rate": 1.3150739179931265e-05, + "loss": 0.0013, + "num_tokens": 41309473.0, + "reward": 1.692307710647583, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.692307710647583, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 301.625, + "completions/mean_terminated_length": 301.625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.916805017524442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09765625, + "kl": 0.05006179539486766, + "learning_rate": 1.3147683177428708e-05, + "loss": 0.002, + "num_tokens": 41320686.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.9169894853348091, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.072265625, + "kl": 0.03785826195962727, + "learning_rate": 1.3144626848595288e-05, + "loss": 0.0015, + "num_tokens": 41328905.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 342.5, + "completions/mean_terminated_length": 342.5, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.9171739531451761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06640625, + "kl": 0.022780447150580585, + "learning_rate": 1.3141570193747873e-05, + "loss": 0.0009, + "num_tokens": 41340749.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 130.875, + "completions/mean_terminated_length": 130.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.9173584209555432, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07763671875, + "kl": 0.029114579083397985, + "learning_rate": 1.313851321320335e-05, + "loss": 0.0012, + "num_tokens": 41344668.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 217.125, + "completions/mean_terminated_length": 217.125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.9175428887659104, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.015625, + "kl": 0.049869046779349446, + "learning_rate": 1.3135455907278647e-05, + "loss": 0.002, + "num_tokens": 41352789.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 240.75, + "completions/mean_terminated_length": 240.75, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.9177273565762775, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2265625, + "kl": 0.0696502416394651, + "learning_rate": 1.3132398276290727e-05, + "loss": 0.0028, + "num_tokens": 41358947.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 711.625, + "completions/mean_terminated_length": 711.625, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "epoch": 0.9179118243866445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.73828125, + "kl": 0.025526192388497293, + "learning_rate": 1.3129340320556587e-05, + "loss": 0.001, + "num_tokens": 41376120.0, + "reward": 1.625, + "reward_std": 0.6943650841712952, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.37796446681022644, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 214.875, + "completions/mean_terminated_length": 214.875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.9180962921970116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.353515625, + "kl": 0.053649333538487554, + "learning_rate": 1.3126282040393252e-05, + "loss": 0.0021, + "num_tokens": 41381671.0, + "reward": 1.5, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.5, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 341.0, + "completions/mean_terminated_length": 341.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.9182807600073787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044921875, + "kl": 0.02353702951222658, + "learning_rate": 1.3123223436117782e-05, + "loss": 0.0009, + "num_tokens": 41388783.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 237.125, + "completions/mean_terminated_length": 237.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.9184652278177458, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06005859375, + "kl": 0.02939384605269879, + "learning_rate": 1.3120164508047282e-05, + "loss": 0.0012, + "num_tokens": 41394008.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 374.5, + "completions/mean_terminated_length": 374.5, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.9186496956281129, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.95703125, + "kl": 0.03050274634733796, + "learning_rate": 1.311710525649887e-05, + "loss": 0.0012, + "num_tokens": 41403284.0, + "reward": 1.90625, + "reward_std": 0.0578637570142746, + "rewards/fixed_code_pass_all_test_reward/mean": 0.90625, + "rewards/fixed_code_pass_all_test_reward/std": 0.0578637570142746, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 284.75, + "completions/mean_terminated_length": 284.75, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.91883416343848, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3203125, + "kl": 0.04935234133154154, + "learning_rate": 1.3114045681789716e-05, + "loss": 0.002, + "num_tokens": 41412170.0, + "reward": 1.375, + "reward_std": 0.5175491571426392, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 302.375, + "completions/mean_terminated_length": 302.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.9190186312488471, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.578125, + "kl": 0.05447501584421843, + "learning_rate": 1.3110985784237014e-05, + "loss": 0.0022, + "num_tokens": 41425221.0, + "reward": 1.4791667461395264, + "reward_std": 0.11086282134056091, + "rewards/fixed_code_pass_all_test_reward/mean": 0.4791666865348816, + "rewards/fixed_code_pass_all_test_reward/std": 0.11086282134056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 235.625, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.9192030990592142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1845703125, + "kl": 0.047786006703972816, + "learning_rate": 1.3107925564157997e-05, + "loss": 0.0019, + "num_tokens": 41433234.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 310.875, + "completions/mean_terminated_length": 310.875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.9193875668695812, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.107421875, + "kl": 0.04382905806414783, + "learning_rate": 1.3104865021869923e-05, + "loss": 0.0018, + "num_tokens": 41442713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 268.5, + "completions/mean_terminated_length": 268.5, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.9195720346799483, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.359375, + "kl": 0.03923512948676944, + "learning_rate": 1.310180415769009e-05, + "loss": 0.0016, + "num_tokens": 41451725.0, + "reward": 1.75, + "reward_std": 0.4629100561141968, + "rewards/fixed_code_pass_all_test_reward/mean": 0.75, + "rewards/fixed_code_pass_all_test_reward/std": 0.4629100561141968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 204.875, + "completions/mean_terminated_length": 204.875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.9197565024903155, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.84375, + "kl": 0.16749763232655823, + "learning_rate": 1.3098742971935831e-05, + "loss": 0.0067, + "num_tokens": 41462044.0, + "reward": 1.8888888359069824, + "reward_std": 0.31426966190338135, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8888888955116272, + "rewards/fixed_code_pass_all_test_reward/std": 0.31426966190338135, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 552.25, + "completions/mean_terminated_length": 552.25, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.9199409703006826, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.59765625, + "kl": 0.017881876439787447, + "learning_rate": 1.309568146492451e-05, + "loss": 0.0007, + "num_tokens": 41476670.0, + "reward": 1.1944444179534912, + "reward_std": 0.07856737822294235, + "rewards/fixed_code_pass_all_test_reward/mean": 0.1944444477558136, + "rewards/fixed_code_pass_all_test_reward/std": 0.07856741547584534, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 263.25, + "completions/mean_terminated_length": 263.25, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.9201254381110496, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2578125, + "kl": 0.058120565954595804, + "learning_rate": 1.3092619636973517e-05, + "loss": 0.0023, + "num_tokens": 41484880.0, + "reward": 1.875, + "reward_std": 0.3535533845424652, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 145.0, + "completions/mean_terminated_length": 145.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.9203099059214167, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0732421875, + "kl": 0.0341759700095281, + "learning_rate": 1.3089557488400288e-05, + "loss": 0.0014, + "num_tokens": 41488872.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 0.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 337.125, + "completions/mean_terminated_length": 337.125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.9204943737317838, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.234375, + "kl": 0.1078754581976682, + "learning_rate": 1.3086495019522285e-05, + "loss": 0.0043, + "num_tokens": 41497321.0, + "reward": 1.125, + "reward_std": 0.8345229625701904, + "rewards/fixed_code_pass_all_test_reward/mean": 0.375, + "rewards/fixed_code_pass_all_test_reward/std": 0.5175492167472839, + "rewards/format_reward/mean": 0.75, + "rewards/format_reward/std": 0.4629100561141968, + "step": 4990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 307.125, + "completions/mean_terminated_length": 307.125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.9206788415421508, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.265625, + "kl": 0.023799549555405974, + "learning_rate": 1.3083432230657006e-05, + "loss": 0.001, + "num_tokens": 41504170.0, + "reward": 1.8379629850387573, + "reward_std": 0.06547286361455917, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8379629850387573, + "rewards/fixed_code_pass_all_test_reward/std": 0.06547285616397858, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 270.25, + "completions/mean_terminated_length": 270.25, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.920863309352518, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.181640625, + "kl": 0.059338051825761795, + "learning_rate": 1.3080369122121974e-05, + "loss": 0.0024, + "num_tokens": 41510212.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 262.75, + "completions/mean_terminated_length": 262.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.9210477771628851, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14453125, + "kl": 0.054949962766841054, + "learning_rate": 1.307730569423476e-05, + "loss": 0.0022, + "num_tokens": 41518826.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 1099.625, + "completions/mean_terminated_length": 1099.625, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "epoch": 0.9212322449732522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27734375, + "kl": 0.008876230422174558, + "learning_rate": 1.3074241947312954e-05, + "loss": 0.0004, + "num_tokens": 41545151.0, + "reward": 1.89673912525177, + "reward_std": 0.2920658588409424, + "rewards/fixed_code_pass_all_test_reward/mean": 0.89673912525177, + "rewards/fixed_code_pass_all_test_reward/std": 0.29206582903862, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.9214167127836193, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15234375, + "kl": 0.04390677623450756, + "learning_rate": 1.3071177881674189e-05, + "loss": 0.0018, + "num_tokens": 41552346.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 274.125, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.9216011805939863, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.5390625, + "kl": 0.06578104849904776, + "learning_rate": 1.306811349763612e-05, + "loss": 0.0026, + "num_tokens": 41561491.0, + "reward": 1.8849999904632568, + "reward_std": 0.32526910305023193, + "rewards/fixed_code_pass_all_test_reward/mean": 0.8849999904632568, + "rewards/fixed_code_pass_all_test_reward/std": 0.3252691328525543, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 691.125, + "completions/mean_terminated_length": 691.125, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.9217856484043534, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.80859375, + "kl": 0.019629077869467437, + "learning_rate": 1.306504879551645e-05, + "loss": 0.0008, + "num_tokens": 41578516.0, + "reward": 1.59375, + "reward_std": 0.2651650309562683, + "rewards/fixed_code_pass_all_test_reward/mean": 0.59375, + "rewards/fixed_code_pass_all_test_reward/std": 0.2651650309562683, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 385.5, + "completions/mean_terminated_length": 385.5, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.9219701162147206, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0625, + "kl": 0.026673778309486806, + "learning_rate": 1.3061983775632904e-05, + "loss": 0.0011, + "num_tokens": 41586336.0, + "reward": 1.8953487873077393, + "reward_std": 0.03288870304822922, + "rewards/fixed_code_pass_all_test_reward/mean": 0.895348846912384, + "rewards/fixed_code_pass_all_test_reward/std": 0.03288870304822922, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 4998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 386.25, + "completions/mean_terminated_length": 386.25, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.9221545840250877, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.875, + "kl": 0.05683528119698167, + "learning_rate": 1.3058918438303245e-05, + "loss": 0.0023, + "num_tokens": 41600298.0, + "reward": 1.75, + "reward_std": 0.7071067690849304, + "rewards/fixed_code_pass_all_test_reward/mean": 0.875, + "rewards/fixed_code_pass_all_test_reward/std": 0.3535533845424652, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.3535533845424652, + "step": 4999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 288.75, + "completions/mean_terminated_length": 288.75, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.9223390518354547, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.96875, + "kl": 0.1047405373537913, + "learning_rate": 1.305585278384526e-05, + "loss": 0.0042, + "num_tokens": 41608136.0, + "reward": 2.0, + "reward_std": 0.0, + "rewards/fixed_code_pass_all_test_reward/mean": 1.0, + "rewards/fixed_code_pass_all_test_reward/std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "step": 5000 + } + ], + "logging_steps": 1, + "max_steps": 10842, + "num_input_tokens_seen": 41608136, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}